Total coverage: 368732 (19%)of 2013570
9 14 8 2 6 9 8 8 8 8 8 8 5 8 3 2 1 1 1 1 1 3 2 7 5 7 6 6 6 5 5 5 4 4 4 3 2 4 6 2 6 3 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> #include <net/inet_sock.h> #include <net/raw.h> #include <net/rawv6.h> #ifdef pr_fmt # undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt static struct raw_hashinfo * raw_get_hashinfo(const struct inet_diag_req_v2 *r) { if (r->sdiag_family == AF_INET) { return &raw_v4_hashinfo; #if IS_ENABLED(CONFIG_IPV6) } else if (r->sdiag_family == AF_INET6) { return &raw_v6_hashinfo; #endif } else { return ERR_PTR(-EINVAL); } } /* * Due to requirement of not breaking user API we can't simply * rename @pad field in inet_diag_req_v2 structure, instead * use helper to figure it out. */ static bool raw_lookup(struct net *net, const struct sock *sk, const struct inet_diag_req_v2 *req) { struct inet_diag_req_raw *r = (void *)req; if (r->sdiag_family == AF_INET) return raw_v4_match(net, sk, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else return raw_v6_match(net, sk, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, r->id.idiag_if, 0); #endif return false; } static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct hlist_head *hlist; struct sock *sk; int slot; if (IS_ERR(hashinfo)) return ERR_CAST(hashinfo); rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill * diag message to be reported, so * caller should call sock_put then. */ if (refcount_inc_not_zero(&sk->sk_refcnt)) goto out_unlock; } } } sk = ERR_PTR(-ENOENT); out_unlock: rcu_read_unlock(); return sk; } static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; struct net *net; int err; net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) { sock_put(sk); return -ENOMEM; } err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); if (err < 0) { kfree_skb(rep); return err; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); return err; } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, bool net_admin) { if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; if (IS_ERR(hashinfo)) return; s_slot = cb->args[0]; num = s_num = cb->args[1]; rcu_read_lock(); for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) goto out_unlock; next: num++; } } out_unlock: rcu_read_unlock(); cb->args[0] = slot; cb->args[1] = num; } static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int raw_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *r) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } #endif static const struct inet_diag_handler raw_diag_handler = { .owner = THIS_MODULE, .dump = raw_diag_dump, .dump_one = raw_diag_dump_one, .idiag_get_info = raw_diag_get_info, .idiag_type = IPPROTO_RAW, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = raw_diag_destroy, #endif }; static void __always_unused __check_inet_diag_req_raw(void) { /* * Make sure the two structures are identical, * except the @pad field. */ #define __offset_mismatch(m1, m2) \ (offsetof(struct inet_diag_req_v2, m1) != \ offsetof(struct inet_diag_req_raw, m2)) BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != sizeof(struct inet_diag_req_raw)); BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); BUILD_BUG_ON(__offset_mismatch(id, id)); #undef __offset_mismatch } static int __init raw_diag_init(void) { return inet_diag_register(&raw_diag_handler); } static void __exit raw_diag_exit(void) { inet_diag_unregister(&raw_diag_handler); } module_init(raw_diag_init); module_exit(raw_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
14 3 4 4 14 1 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 1 11 11 11 8 8 8 8 8 8 8 8 1 8 11 11 11 3 2 2 8 8 8 8 8 8 8 8 7 8 8 8 8 8 8 8 2 8 8 8 8 8 8 8 8 1 1 1 1 1 1 1 1 8 8 8 8 8 8 8 8 3 3 3 3 3 3 3 3 3 3 3 6 6 6 5 5 5 6 6 1 5 6 4 6 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 1 1 2 2 2 2 2 3 5 5 5 13 13 41 41 1 41 41 41 41 3 1 3 14 14 14 14 3 3 14 14 3 1 1 1 1 6 6 6 6 3 1 1 1 1 6 5 24 24 24 8 8 1 63 42 42 72 4 1 3 2 69 4 1 67 2 66 3 1 65 6 1 64 3 1 72 1 1 1 14 12 5 14 3 14 14 14 14 14 3 3 14 3 14 14 14 14 14 14 1 14 14 14 1 14 14 14 3 14 5 12 14 1 13 14 12 2 14 1 12 51 34 44 44 44 42 40 31 51 60 60 43 6 11 10 1 60 2 7 7 1 1 6 1 5 51 1 50 1 55 4 1 3 51 51 2 49 1 48 48 51 48 60 31 42 42 42 2 42 41 3 39 3 2 40 1 42 3 42 39 42 60 59 60 41 2 2 2 39 41 8 41 16 29 25 12 29 63 63 2 63 61 7 56 4 55 2 3 60 4 4 63 4 63 1 62 1 63 63 1 63 1 63 3 60 60 3 63 3 60 4 60 4 59 2 61 4 5 58 4 59 2 4 59 2 3 60 1 3 1 62 2 56 63 2 61 5 58 3 60 3 63 63 63 2 1 2 1 1 1 1 61 62 2 62 2 2 60 2 62 4 2 63 63 63 3 3 3 69 74 74 25 25 8 73 74 2 2 2 74 74 74 73 74 74 74 74 74 74 73 74 73 74 74 74 74 74 74 74 74 73 74 74 74 74 1 74 74 74 466 300 300 467 1996 467 1990 23 1994 1 1997 267 267 9 9 9 9 21 9 9 99 99 99 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 // SPDX-License-Identifier: GPL-2.0-only /* * VXLAN: Virtual eXtensible Local Area Network * * Copyright (c) 2012-2013 Vyatta Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/if_ether.h> #include <linux/ethtool.h> #include <linux/rhashtable.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/gro.h> #include <net/ipv6_stubs.h> #include <net/ip.h> #include <net/icmp.h> #include <net/rtnetlink.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/tun_proto.h> #include <net/vxlan.h> #include <net/nexthop.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #endif #include "vxlan_private.h" #define VXLAN_VERSION "0.1" #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 * for compatibility with early adopters. */ static unsigned short vxlan_port __read_mostly = 8472; module_param_named(udp_port, vxlan_port, ushort, 0444); MODULE_PARM_DESC(udp_port, "Destination UDP port"); static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); unsigned int vxlan_net_id; const u8 all_zeros_mac[ETH_ALEN + 2]; static struct rtnl_link_ops vxlan_link_ops; static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); static const struct rhashtable_params vxlan_fdb_rht_params = { .head_offset = offsetof(struct vxlan_fdb, rhnode), .key_offset = offsetof(struct vxlan_fdb, key), .key_len = sizeof(struct vxlan_fdb_key), .automatic_shrinking = true, }; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { return vs->flags & VXLAN_F_COLLECT_METADATA || ip_tunnel_collect_metadata(); } /* Find VXLAN socket based on network namespace, address family, UDP port, * enabled unshareable flags and socket device binding (see l3mdev with * non-default VRF). */ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, __be16 port, u32 flags, int ifindex) { struct vxlan_sock *vs; flags &= VXLAN_F_RCV_FLAGS; hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && vxlan_get_sk_family(vs) == family && vs->flags == flags && vs->sock->sk->sk_bound_dev_if == ifindex) return vs; } return NULL; } static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex, __be32 vni, struct vxlan_vni_node **vninode) { struct vxlan_vni_node *vnode; struct vxlan_dev_node *node; /* For flow based devices, map all packets to VNI 0 */ if (vs->flags & VXLAN_F_COLLECT_METADATA && !(vs->flags & VXLAN_F_VNIFILTER)) vni = 0; hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) { if (!node->vxlan) continue; vnode = NULL; if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) { vnode = vxlan_vnifilter_lookup(node->vxlan, vni); if (!vnode) continue; } else if (node->vxlan->default_dst.remote_vni != vni) { continue; } if (IS_ENABLED(CONFIG_IPV6)) { const struct vxlan_config *cfg = &node->vxlan->cfg; if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) && cfg->remote_ifindex != ifindex) continue; } if (vninode) *vninode = vnode; return node->vxlan; } return NULL; } /* Look up VNI in a per net namespace table */ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex, __be32 vni, sa_family_t family, __be16 port, u32 flags) { struct vxlan_sock *vs; vs = vxlan_find_sock(net, family, port, flags, ifindex); if (!vs) return NULL; return vxlan_vs_find_vni(vs, ifindex, vni, NULL); } /* Fill in neighbour message in skbuff. */ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, u32 portid, u32 seq, int type, unsigned int flags, const struct vxlan_rdst *rdst) { unsigned long now = jiffies; struct nda_cacheinfo ci; bool send_ip, send_eth; struct nlmsghdr *nlh; struct nexthop *nh; struct ndmsg *ndm; int nh_family; u32 nh_id; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); memset(ndm, 0, sizeof(*ndm)); send_eth = send_ip = true; rcu_read_lock(); nh = rcu_dereference(fdb->nh); if (nh) { nh_family = nexthop_get_family(nh); nh_id = nh->id; } rcu_read_unlock(); if (type == RTM_GETNEIGH) { if (rdst) { send_ip = !vxlan_addr_any(&rdst->remote_ip); ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; } else if (nh) { ndm->ndm_family = nh_family; } send_eth = !is_zero_ether_addr(fdb->key.eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; if (!net_eq(dev_net(vxlan->dev), vxlan->net) && nla_put_s32(skb, NDA_LINK_NETNSID, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) goto nla_put_failure; } else if (rdst) { if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) goto nla_put_failure; if (rdst->remote_ifindex && nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) goto nla_put_failure; } if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->key.vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->key.vni))) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - READ_ONCE(fdb->updated)); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ + nla_total_size(sizeof(struct nda_cacheinfo)); } static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type) { struct net *net = dev_net(vxlan->dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); if (err < 0) { /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, const struct vxlan_rdst *rd, struct netlink_ext_ack *extack, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { fdb_info->info.dev = vxlan->dev; fdb_info->info.extack = extack; fdb_info->remote_ip = rd->remote_ip; fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; memcpy(fdb_info->eth_addr, fdb->key.eth_addr, ETH_ALEN); fdb_info->vni = fdb->key.vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, bool adding, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info info; enum switchdev_notifier_type notifier_type; int ret; if (WARN_ON(!rd)) return 0; notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE; vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info); ret = call_switchdev_notifiers(notifier_type, vxlan->dev, &info.info, extack); return notifier_to_errno(ret); } static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type, bool swdev_notify, struct netlink_ext_ack *extack) { int err; if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, true, extack); if (err) return err; break; case RTM_DELNEIGH: vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, false, extack); break; } } __vxlan_fdb_notify(vxlan, fdb, rd, type); return 0; } static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { .remote_ip = *ipa, /* goes to NDA_DST */ .remote_vni = cpu_to_be32(VXLAN_N_VID), }; vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) { struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { }; memcpy(f.key.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct vxlan_fdb_key key; memset(&key, 0, sizeof(key)); memcpy(key.eth_addr, mac, sizeof(key.eth_addr)); if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) key.vni = vxlan->default_dst.remote_vni; else key.vni = vni; return rhashtable_lookup(&vxlan->fdb_hash_tbl, &key, vxlan_fdb_rht_params); } static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct vxlan_fdb *f; f = vxlan_find_mac_rcu(vxlan, mac, vni); if (f) { unsigned long now = jiffies; if (READ_ONCE(f->used) != now) WRITE_ONCE(f->used, now); } return f; } static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct vxlan_fdb *f; lockdep_assert_held_once(&vxlan->hash_lock); rcu_read_lock(); f = vxlan_find_mac_rcu(vxlan, mac, vni); rcu_read_unlock(); return f; } /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex) { struct vxlan_rdst *rd; list_for_each_entry(rd, &f->remotes, list) { if (vxlan_addr_equal(&rd->remote_ip, ip) && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) return rd; } return NULL; } int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); u8 eth_addr[ETH_ALEN + 2] = { 0 }; struct vxlan_rdst *rdst = NULL; struct vxlan_fdb *f; int rc = 0; if (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)) return -EINVAL; ether_addr_copy(eth_addr, mac); rcu_read_lock(); f = vxlan_find_mac_rcu(vxlan, eth_addr, vni); if (f) rdst = first_remote_rcu(f); if (!rdst) { rc = -ENOENT; goto out; } vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info); out: rcu_read_unlock(); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc); static int vxlan_fdb_notify_one(struct notifier_block *nb, const struct vxlan_dev *vxlan, const struct vxlan_fdb *f, const struct vxlan_rdst *rdst, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info fdb_info; int rc; vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info); rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE, &fdb_info); return notifier_to_errno(rc); } int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; int rc = 0; if (!netif_is_vxlan(dev)) return -EINVAL; vxlan = netdev_priv(dev); spin_lock_bh(&vxlan->hash_lock); hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { if (f->key.vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, extack); if (rc) goto unlock; } } } spin_unlock_bh(&vxlan->hash_lock); return 0; unlock: spin_unlock_bh(&vxlan->hash_lock); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); spin_lock_bh(&vxlan->hash_lock); hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { if (f->key.vni == vni) { list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; } } spin_unlock_bh(&vxlan->hash_lock); } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); /* Replace destination of unicast mac */ static int vxlan_fdb_replace(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst *oldrd) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); if (!rd) return 0; *oldrd = *rd; dst_cache_reset(&rd->dst_cache); rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; rd->offloaded = false; return 1; } /* Add/update destinations for multicast */ static int vxlan_fdb_append(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst **rdp) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOMEM; /* The driver can work correctly without a dst cache, so do not treat * dst cache initialization errors as fatal. */ dst_cache_init(&rd->dst_cache, GFP_ATOMIC | __GFP_NOWARN); rd->remote_ip = *ip; rd->remote_port = port; rd->offloaded = false; rd->remote_vni = vni; rd->remote_ifindex = ifindex; list_add_tail_rcu(&rd->list, &f->remotes); *rdp = rd; return 1; } static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol) { const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. */ if (!gpe->np_applied) return false; /* "The initial version is 0. If a receiver does not support the * version indicated it MUST drop the packet. */ if (gpe->version != 0) return false; /* "When the O bit is set to 1, the packet is an OAM packet and OAM * processing MUST occur." However, we don't implement OAM * processing, thus drop the packet. */ if (gpe->oam_flag) return false; *protocol = tun_p_to_eth_p(gpe->next_protocol); if (!*protocol) return false; return true; } static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, unsigned int off, struct vxlanhdr *vh, size_t hdrlen, __be32 vni_field, struct gro_remcsum *grc, bool nopartial) { size_t start, offset; if (skb->remcsum_offload) return vh; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; start = vxlan_rco_start(vni_field); offset = start + vxlan_rco_offset(vni_field); vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, start, offset, grc, nopartial); skb->remcsum_offload = 1; return vh; } static struct vxlanhdr *vxlan_gro_prepare_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb, struct gro_remcsum *grc) { struct sk_buff *p; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk); __be32 flags; skb_gro_remcsum_init(grc); off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); vh = skb_gro_header(skb, hlen, off_vx); if (unlikely(!vh)) return NULL; skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); flags = vh->vx_flags; if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), vh->vx_vni, grc, !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); if (!vh) return NULL; } skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); if (vh->vx_flags != vh2->vx_flags || vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } return vh; } static struct sk_buff *vxlan_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct gro_remcsum grc; int flush = 1; if (vxlan_gro_prepare_receive(sk, head, skb, &grc)) { pp = call_gro_receive(eth_gro_receive, head, skb); flush = 0; } skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; struct sk_buff *pp = NULL; struct gro_remcsum grc; struct vxlanhdr *vh; __be16 protocol; int flush = 1; vh = vxlan_gro_prepare_receive(sk, head, skb, &grc); if (vh) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto out; ptype = gro_find_receive_by_type(protocol); if (!ptype) goto out; pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; } out: skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { /* Sets 'skb->inner_mac_header' since we are always called with * 'skb->encapsulation' set. */ return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } static int vxlan_gpe_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct vxlanhdr *vh = (struct vxlanhdr *)(skb->data + nhoff); const struct packet_offload *ptype; int err = -ENOSYS; __be16 protocol; if (!vxlan_parse_gpe_proto(vh, &protocol)) return err; ptype = gro_find_complete_by_type(protocol); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); return err; } static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, __u16 state, __be32 src_vni, __u16 ndm_flags) { struct vxlan_fdb *f; f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; memset(&f->key, 0, sizeof(f->key)); f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; f->key.vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->key.eth_addr, mac, ETH_ALEN); return f; } static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { struct nexthop *old_nh = rtnl_dereference(fdb->nh); struct nexthop *nh; int err = -EINVAL; if (old_nh && old_nh->id == nhid) return 0; nh = nexthop_find_by_id(vxlan->net, nhid); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); nh = NULL; goto err_inval; } if (!nexthop_is_fdb(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); goto err_inval; } if (!nexthop_is_multipath(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); goto err_inval; } /* check nexthop group family */ switch (vxlan->default_dst.remote_ip.sa.sa_family) { case AF_INET: if (!nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } break; case AF_INET6: if (nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } } if (old_nh) { list_del_rcu(&fdb->nh_list); nexthop_put(old_nh); } rcu_assign_pointer(fdb->nh, nh); list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); return 1; err_inval: if (nh) nexthop_put(nh); return err; } int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, struct vxlan_fdb **fdb, struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int rc; if (vxlan->cfg.addrmax && vxlan->addrcnt >= vxlan->cfg.addrmax) return -ENOSPC; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags); if (!f) return -ENOMEM; if (nhid) rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); else rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) goto errout; rc = rhashtable_lookup_insert_fast(&vxlan->fdb_hash_tbl, &f->rhnode, vxlan_fdb_rht_params); if (rc) goto destroy_remote; ++vxlan->addrcnt; hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); *fdb = f; return 0; destroy_remote: if (rcu_access_pointer(f->nh)) { list_del_rcu(&f->nh_list); nexthop_put(rtnl_dereference(f->nh)); } else { list_del(&rd->list); dst_cache_destroy(&rd->dst_cache); kfree(rd); } errout: kfree(f); return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; struct nexthop *nh; nh = rcu_dereference_raw(f->nh); if (nh) { rcu_assign_pointer(f->nh, NULL); rcu_assign_pointer(f->vdev, NULL); nexthop_put(nh); } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); kfree(rd); } kfree(f); } static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); __vxlan_fdb_free(f); } static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, bool do_notify, bool swdev_notify) { struct vxlan_rdst *rd; netdev_dbg(vxlan->dev, "delete %pM\n", f->key.eth_addr); --vxlan->addrcnt; if (do_notify) { if (rcu_access_pointer(f->nh)) vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); else list_for_each_entry(rd, &f->remotes, list) vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); } hlist_del_init_rcu(&f->fdb_node); rhashtable_remove_fast(&vxlan->fdb_hash_tbl, &f->rhnode, vxlan_fdb_rht_params); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } static void vxlan_dst_free(struct rcu_head *head) { struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); dst_cache_destroy(&rd->dst_cache); kfree(rd); } static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_rdst *rd = NULL; struct vxlan_rdst oldrd; int notify = 0; int rc = 0; int err; if (nhid && !rcu_access_pointer(f->nh)) { NL_SET_ERR_MSG(extack, "Cannot replace an existing non nexthop fdb with a nexthop"); return -EOPNOTSUPP; } if (nhid && (flags & NLM_F_APPEND)) { NL_SET_ERR_MSG(extack, "Cannot append to a nexthop fdb"); return -EOPNOTSUPP; } /* Do not allow an externally learned entry to take over an entry added * by the user. */ if (!(fdb_flags & NTF_EXT_LEARNED) || !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; notify = 1; } } if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ if (!(is_multicast_ether_addr(f->key.eth_addr) || is_zero_ether_addr(f->key.eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) return rc; } else { rc = vxlan_fdb_replace(f, ip, port, vni, ifindex, &oldrd); } notify |= rc; } else { NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } if ((flags & NLM_F_APPEND) && (is_multicast_ether_addr(f->key.eth_addr) || is_zero_ether_addr(f->key.eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) return rc; notify |= rc; } if (ndm_flags & NTF_USE) WRITE_ONCE(f->updated, jiffies); if (notify) { if (rd == NULL) rd = first_remote_rtnl(f); WRITE_ONCE(f->updated, jiffies); err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify, extack); if (err) goto err_notify; } return 0; err_notify: if (nhid) return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { list_del_rcu(&rd->list); call_rcu(&rd->rcu, vxlan_dst_free); } return err; } static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_fdb *f; int rc; /* Disallow replace to add a multicast entry */ if ((flags & NLM_F_REPLACE) && (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) return -EOPNOTSUPP; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, swdev_notify, extack); if (rc) goto err_notify; return 0; err_notify: vxlan_fdb_destroy(vxlan, f, false, false); return rc; } /* Add new entry to forwarding table -- assumes lock held */ int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { struct vxlan_fdb *f; f = vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, "lost race to create %pM\n", mac); return -EEXIST; } return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, ndm_flags, nhid, swdev_notify, extack); } } static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, struct vxlan_rdst *rd, bool swdev_notify) { list_del_rcu(&rd->list); vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); call_rcu(&rd->rcu, vxlan_dst_free); } static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, __be32 *vni, u32 *ifindex, u32 *nhid, struct netlink_ext_ack *extack) { struct net *net = dev_net(vxlan->dev); int err; if (tb[NDA_NH_ID] && (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT])) { NL_SET_ERR_MSG(extack, "DST, VNI, ifindex and port are mutually exclusive with NH_ID"); return -EINVAL; } if (tb[NDA_DST]) { err = vxlan_nla_get_addr(ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG(extack, "Unsupported address family"); return err; } } else { union vxlan_addr *remote = &vxlan->default_dst.remote_ip; if (remote->sa.sa_family == AF_INET) { ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); ip->sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { ip->sin6.sin6_addr = in6addr_any; ip->sa.sa_family = AF_INET6; #endif } } if (tb[NDA_PORT]) { if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) { NL_SET_ERR_MSG(extack, "Invalid vxlan port"); return -EINVAL; } *port = nla_get_be16(tb[NDA_PORT]); } else { *port = vxlan->cfg.dst_port; } if (tb[NDA_VNI]) { if (nla_len(tb[NDA_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid vni"); return -EINVAL; } *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); } else { *vni = vxlan->default_dst.remote_vni; } if (tb[NDA_SRC_VNI]) { if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid src vni"); return -EINVAL; } *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); } else { *src_vni = vxlan->default_dst.remote_vni; } if (tb[NDA_IFINDEX]) { struct net_device *tdev; if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } *ifindex = nla_get_u32(tb[NDA_IFINDEX]); tdev = __dev_get_by_index(net, *ifindex); if (!tdev) { NL_SET_ERR_MSG(extack, "Device not found"); return -EADDRNOTAVAIL; } } else { *ifindex = 0; } *nhid = nla_get_u32_default(tb[NDA_NH_ID], 0); return 0; } /* Add static entry (via netlink) */ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ union vxlan_addr ip; __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { pr_info("RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; return err; } int __vxlan_fdb_delete(struct vxlan_dev *vxlan, const unsigned char *addr, union vxlan_addr ip, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; if (!vxlan_addr_any(&ip)) { rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); if (!rd) goto out; } /* remove a destination if it's not the only one on the list, * otherwise destroy the fdb entry */ if (rd && !list_is_singular(&f->remotes)) { vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify); goto out; } vxlan_fdb_destroy(vxlan, f, true, swdev_notify); out: return 0; } /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; __be16 port; int err; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; spin_lock_bh(&vxlan->hash_lock); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; return err; } /* Dump forwarding table */ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; int err = 0; rcu_read_lock(); hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { struct vxlan_rdst *rd; if (rcu_access_pointer(f->nh)) { if (*idx < ctx->fdb_idx) goto skip_nh; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, NULL); if (err < 0) { rcu_read_unlock(); goto out; } skip_nh: *idx += 1; continue; } list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < ctx->fdb_idx) goto skip; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, rd); if (err < 0) { rcu_read_unlock(); goto out; } skip: *idx += 1; } } rcu_read_unlock(); out: return err; } static int vxlan_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; __be32 vni; int err; if (tb[NDA_VNI]) vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); else vni = vxlan->default_dst.remote_vni; rcu_read_lock(); f = vxlan_find_mac_rcu(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; goto errout; } err = vxlan_fdb_info(skb, vxlan, f, portid, seq, RTM_NEWNEIGH, 0, first_remote_rcu(f)); errout: rcu_read_unlock(); return err; } /* Watch incoming packets to learn mapping between Ethernet address * and Tunnel endpoint. */ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, union vxlan_addr *src_ip, const u8 *src_mac, u32 src_ifindex, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 ifindex = 0; /* Ignore packets from invalid src-address */ if (!is_valid_ether_addr(src_mac)) return SKB_DROP_REASON_MAC_INVALID_SOURCE; #if IS_ENABLED(CONFIG_IPV6) if (src_ip->sa.sa_family == AF_INET6 && (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)) ifindex = src_ifindex; #endif f = vxlan_find_mac_rcu(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); unsigned long now = jiffies; if (READ_ONCE(f->updated) != now) WRITE_ONCE(f->updated, now); /* Don't override an fdb with nexthop with a learnt entry */ if (rcu_access_pointer(f->nh)) return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) return SKB_NOT_DROPPED_YET; /* Don't migrate static entries, drop packets */ if (f->state & (NUD_PERMANENT | NUD_NOARP)) return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { /* learned new entry */ spin_lock(&vxlan->hash_lock); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) vxlan_fdb_update(vxlan, src_mac, src_ip, NUD_REACHABLE, NLM_F_EXCL|NLM_F_CREATE, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock); } return SKB_NOT_DROPPED_YET; } static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) { ASSERT_RTNL(); if (!vs) return false; if (!refcount_dec_and_test(&vs->refcnt)) return false; hlist_del_rcu(&vs->hlist); udp_tunnel_notify_del_rx_port(vs->sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); return true; } static void vxlan_sock_release(struct vxlan_dev *vxlan) { struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); RCU_INIT_POINTER(vxlan->vn6_sock, NULL); #endif RCU_INIT_POINTER(vxlan->vn4_sock, NULL); synchronize_net(); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vs_del_vnigrp(vxlan); else vxlan_vs_del_dev(vxlan); if (__vxlan_sock_release_prep(sock4)) { udp_tunnel_sock_release(sock4->sock); kfree(sock4); } #if IS_ENABLED(CONFIG_IPV6) if (__vxlan_sock_release_prep(sock6)) { udp_tunnel_sock_release(sock6->sock); kfree(sock6); } #endif } static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags) { const struct vxlanhdr *vh = vxlan_hdr(skb); enum skb_drop_reason reason; size_t start, offset; if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) return SKB_NOT_DROPPED_YET; start = vxlan_rco_start(vh->vx_vni); offset = start + vxlan_rco_offset(vh->vx_vni); reason = pskb_may_pull_reason(skb, offset + sizeof(u16)); if (reason) return reason; skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); return SKB_NOT_DROPPED_YET; } static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { const struct vxlanhdr *vh = vxlan_hdr(skb); const struct vxlanhdr_gbp *gbp; struct metadata_dst *tun_dst; gbp = (const struct vxlanhdr_gbp *)vh; if (!(vh->vx_flags & VXLAN_HF_GBP)) return; md->gbp = ntohs(gbp->policy_id); tun_dst = (struct metadata_dst *)skb_dst(skb); if (tun_dst) { __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_dst->u.tun_info.key.tun_flags); tun_dst->u.tun_info.options_len = sizeof(*md); } if (gbp->dont_learn) md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) md->gbp |= VXLAN_GBP_POLICY_APPLIED; /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; } static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan, struct vxlan_sock *vs, struct sk_buff *skb, __be32 vni) { union vxlan_addr saddr; u32 ifindex = skb->dev->ifindex; skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, vxlan->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) return SKB_DROP_REASON_LOCAL_MAC; /* Get address from the outer IP header */ if (vxlan_get_sk_family(vs) == AF_INET) { saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET6; #endif } if (!(vxlan->cfg.flags & VXLAN_F_LEARN)) return SKB_NOT_DROPPED_YET; return vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni); } static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, struct sk_buff *skb) { int err = 0; if (vxlan_get_sk_family(vs) == AF_INET) err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err) && log_ecn_error) { if (vxlan_get_sk_family(vs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } return err <= 1; } static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { struct vxlan_vni_node *vninode = NULL; const struct vxlanhdr *vh; struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); enum skb_drop_reason reason; bool raw_proto = false; void *oiph; __be32 vni = 0; int nh; /* Need UDP and VXLAN header to be present */ reason = pskb_may_pull_reason(skb, VXLAN_HLEN); if (reason) goto drop; vh = vxlan_hdr(skb); /* VNI flag always required to be set */ if (!(vh->vx_flags & VXLAN_HF_VNI)) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", ntohl(vh->vx_flags), ntohl(vh->vx_vni)); reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; /* Return non vxlan pkt */ goto drop; } vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; vni = vxlan_vni(vh->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); if (!vxlan) { reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND; goto drop; } if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags || vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) { /* If the header uses bits besides those enabled by the * netdevice configuration, treat this as a malformed packet. * This behavior diverges from VXLAN RFC (RFC7348) which * stipulates that bits in reserved in reserved fields are to be * ignored. The approach here maintains compatibility with * previous stack code, and also is more robust and provides a * little more security in adding extensions to VXLAN. */ reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; DEV_STATS_INC(vxlan->dev, rx_frame_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } if (vxlan->cfg.flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; raw_proto = true; } if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, !net_eq(vxlan->net, dev_net(vxlan->dev)))) { reason = SKB_DROP_REASON_NOMEM; goto drop; } if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) { reason = vxlan_remcsum(skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; } if (vxlan_collect_metadata(vs)) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst; __set_bit(IP_TUNNEL_KEY_BIT, flags); tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags, key32_to_tunnel_id(vni), sizeof(*md)); if (!tun_dst) { reason = SKB_DROP_REASON_NOMEM; goto drop; } md = ip_tunnel_info_opts(&tun_dst->u.tun_info); skb_dst_set(skb, (struct dst_entry *)tun_dst); } else { memset(md, 0, sizeof(*md)); } if (vxlan->cfg.flags & VXLAN_F_GBP) vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md); /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ if (!raw_proto) { reason = vxlan_set_mac(vxlan, vs, skb, vni); if (reason) goto drop; } else { skb_reset_mac_header(skb); skb->dev = vxlan->dev; skb->pkt_type = PACKET_HOST; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); reason = pskb_inet_may_pull_reason(skb); if (reason) { DEV_STATS_INC(vxlan->dev, rx_length_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { reason = SKB_DROP_REASON_IP_TUNNEL_ECN; DEV_STATS_INC(vxlan->dev, rx_frame_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } rcu_read_lock(); if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); dev_dstats_rx_dropped(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); reason = SKB_DROP_REASON_DEV_READY; goto drop; } dev_dstats_rx_add(vxlan->dev, skb->len); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); rcu_read_unlock(); return 0; drop: reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED; /* Consume bad packet */ kfree_skb_reason(skb, reason); return 0; } static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) { struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr *hdr; __be32 vni; if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN)) return -EINVAL; hdr = vxlan_hdr(skb); if (!(hdr->vx_flags & VXLAN_HF_VNI)) return -EINVAL; vs = rcu_dereference_sk_user_data(sk); if (!vs) return -ENOENT; vni = vxlan_vni(hdr->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL); if (!vxlan) return -ENOENT; return 0; } static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct arphdr *parp; u8 *arpptr, *sha; __be32 sip, tip; struct neighbour *n; if (dev->flags & IFF_NOARP) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); goto out; } parp = arp_hdr(skb); if ((parp->ar_hrd != htons(ARPHRD_ETHER) && parp->ar_hrd != htons(ARPHRD_IEEE802)) || parp->ar_pro != htons(ETH_P_IP) || parp->ar_op != htons(ARPOP_REQUEST) || parp->ar_hln != dev->addr_len || parp->ar_pln != 4) goto out; arpptr = (u8 *)parp + sizeof(struct arphdr); sha = arpptr; arpptr += dev->addr_len; /* sha */ memcpy(&sip, arpptr, sizeof(sip)); arpptr += sizeof(sip); arpptr += dev->addr_len; /* tha */ memcpy(&tip, arpptr, sizeof(tip)); if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) goto out; n = neigh_lookup(&arp_tbl, &tip, dev); if (n) { struct vxlan_rdst *rdst = NULL; struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } rcu_read_lock(); f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f) rdst = first_remote_rcu(f); if (rdst && vxlan_addr_any(&rdst->remote_ip)) { /* bridge-local neighbor */ neigh_release(n); rcu_read_unlock(); goto out; } rcu_read_unlock(); reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); neigh_release(n); if (reply == NULL) goto out; skb_reset_mac_header(reply); __skb_pull(reply, skb_network_offset(reply)); reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = tip, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); } out: consume_skb(skb); return NETDEV_TX_OK; } #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *vxlan_na_create(struct sk_buff *request, struct neighbour *n, bool isrouter) { struct net_device *dev = request->dev; struct sk_buff *reply; struct nd_msg *ns, *na; struct ipv6hdr *pip6; u8 *daddr; int na_olen = 8; /* opt hdr + ETH_ALEN for target */ int ns_olen; int i, len; if (dev == NULL || !pskb_may_pull(request, request->len)) return NULL; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + sizeof(*na) + na_olen + dev->needed_tailroom; reply = alloc_skb(len, GFP_ATOMIC); if (reply == NULL) return NULL; reply->protocol = htons(ETH_P_IPV6); reply->dev = dev; skb_reserve(reply, LL_RESERVED_SPACE(request->dev)); skb_push(reply, sizeof(struct ethhdr)); skb_reset_mac_header(reply); ns = (struct nd_msg *)(ipv6_hdr(request) + 1); daddr = eth_hdr(request)->h_source; ns_olen = request->len - skb_network_offset(request) - sizeof(struct ipv6hdr) - sizeof(*ns); for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { if (!ns->opt[i + 1]) { kfree_skb(reply); return NULL; } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; } } /* Ethernet header */ ether_addr_copy(eth_hdr(reply)->h_dest, daddr); ether_addr_copy(eth_hdr(reply)->h_source, n->ha); eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); reply->protocol = htons(ETH_P_IPV6); skb_pull(reply, sizeof(struct ethhdr)); skb_reset_network_header(reply); skb_put(reply, sizeof(struct ipv6hdr)); /* IPv6 header */ pip6 = ipv6_hdr(reply); memset(pip6, 0, sizeof(struct ipv6hdr)); pip6->version = 6; pip6->priority = ipv6_hdr(request)->priority; pip6->nexthdr = IPPROTO_ICMPV6; pip6->hop_limit = 255; pip6->daddr = ipv6_hdr(request)->saddr; pip6->saddr = *(struct in6_addr *)n->primary_key; skb_pull(reply, sizeof(struct ipv6hdr)); skb_reset_transport_header(reply); /* Neighbor Advertisement */ na = skb_put_zero(reply, sizeof(*na) + na_olen); na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; na->icmph.icmp6_router = isrouter; na->icmph.icmp6_override = 1; na->icmph.icmp6_solicited = 1; na->target = ns->target; ether_addr_copy(&na->opt[2], n->ha); na->opt[0] = ND_OPT_TARGET_LL_ADDR; na->opt[1] = na_olen >> 3; na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6, csum_partial(na, sizeof(*na)+na_olen, 0)); pip6->payload_len = htons(sizeof(*na)+na_olen); skb_push(reply, sizeof(struct ipv6hdr)); reply->ip_summed = CHECKSUM_UNNECESSARY; return reply; } static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); const struct in6_addr *daddr; const struct ipv6hdr *iphdr; struct inet6_dev *in6_dev; struct neighbour *n; struct nd_msg *msg; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) goto out; iphdr = ipv6_hdr(skb); daddr = &iphdr->daddr; msg = (struct nd_msg *)(iphdr + 1); if (ipv6_addr_loopback(daddr) || ipv6_addr_is_multicast(&msg->target)) goto out; n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); if (n) { struct vxlan_rdst *rdst = NULL; struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f) rdst = first_remote_rcu(f); if (rdst && vxlan_addr_any(&rdst->remote_ip)) { /* bridge-local neighbor */ neigh_release(n); goto out; } reply = vxlan_na_create(skb, n, !!(f ? f->flags & NTF_ROUTER : 0)); neigh_release(n); if (reply == NULL) goto out; if (netif_rx(reply) == NET_RX_DROP) { dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin6.sin6_addr = msg->target, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); } out: rcu_read_unlock(); consume_skb(skb); return NETDEV_TX_OK; } #endif static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct neighbour *n; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) return false; n = NULL; switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: { struct iphdr *pip; if (!pskb_may_pull(skb, sizeof(struct iphdr))) return false; pip = ip_hdr(skb); n = neigh_lookup(&arp_tbl, &pip->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = pip->daddr, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: { struct ipv6hdr *pip6; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return false; pip6 = ipv6_hdr(skb); n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin6.sin6_addr = pip6->daddr, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #endif default: return false; } if (n) { bool diff; diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha); if (diff) { memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, dev->addr_len); memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); } neigh_release(n); return diff; } return false; } static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol) { struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; gpe->np_applied = 1; gpe->next_protocol = tun_p_from_eth_p(protocol); if (!gpe->next_protocol) return -EPFNOSUPPORT; return 0; } static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, int iphdr_len, __be32 vni, struct vxlan_metadata *md, u32 vxflags, bool udp_sum) { struct vxlanhdr *vxh; int min_headroom; int err; int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 inner_protocol = htons(ETH_P_TEB); if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { int csum_start = skb_checksum_start_offset(skb); if (csum_start <= VXLAN_MAX_REMCSUM_START && !(csum_start & VXLAN_RCO_SHIFT_MASK) && (skb->csum_offset == offsetof(struct udphdr, check) || skb->csum_offset == offsetof(struct tcphdr, check))) type |= SKB_GSO_TUNNEL_REMCSUM; } min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + iphdr_len; /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); if (unlikely(err)) return err; err = iptunnel_handle_offloads(skb, type); if (err) return err; vxh = __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = VXLAN_HF_VNI; vxh->vx_vni = vxlan_vni_field(vni); if (type & SKB_GSO_TUNNEL_REMCSUM) { unsigned int start; start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr); vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset); vxh->vx_flags |= VXLAN_HF_RCO; if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; skb->encapsulation = 0; } } if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, md); if (vxflags & VXLAN_F_GPE) { err = vxlan_build_gpe_hdr(vxh, skb->protocol); if (err < 0) return err; inner_protocol = skb->protocol; } skb_set_inner_protocol(skb, inner_protocol); return 0; } /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan, __be32 vni, bool snoop) { union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; unsigned int len = skb->len; struct net_device *dev; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; __skb_pull(skb, skb_network_offset(skb)); if (remote_ip->sa.sa_family == AF_INET) { loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); loopback.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { loopback.sin6.sin6_addr = in6addr_loopback; loopback.sa.sa_family = AF_INET6; #endif } rcu_read_lock(); dev = skb->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY); goto drop; } if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop) vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); dev_dstats_tx_add(src_vxlan->dev, len); vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { dev_dstats_rx_add(dst_vxlan->dev, len); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, len); } else { drop: dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } rcu_read_unlock(); } static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan, int addr_family, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, u32 rt_flags) { #if IS_ENABLED(CONFIG_IPV6) /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry. */ BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL); #endif /* Bypass encapsulation if the destination is local */ if (rt_flags & RTCF_LOCAL && !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) { struct vxlan_dev *dst_vxlan; dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, addr_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND); return -ENOENT; } vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; } return 0; } void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc) { struct dst_cache *dst_cache; struct ip_tunnel_info *info; struct ip_tunnel_key *pkey; struct ip_tunnel_key key; struct vxlan_dev *vxlan = netdev_priv(dev); const struct iphdr *old_iph; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; unsigned int pkt_len = skb->len; __be16 src_port = 0, dst_port; struct dst_entry *ndst = NULL; int addr_family; __u8 tos, ttl; int ifindex; int err; u32 flags = vxlan->cfg.flags; bool use_cache; bool udp_sum = false; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); enum skb_drop_reason reason; bool no_eth_encap; __be32 vni = 0; no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB); reason = skb_vlan_inet_prepare(skb, no_eth_encap); if (reason) goto drop; reason = SKB_DROP_REASON_NOT_SPECIFIED; old_iph = ip_hdr(skb); info = skb_tunnel_info(skb); use_cache = ip_tunnel_dst_cache_usable(skb, info); if (rdst) { memset(&key, 0, sizeof(key)); pkey = &key; if (vxlan_addr_any(&rdst->remote_ip)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan, default_vni, true); return; } goto drop; } addr_family = vxlan->cfg.saddr.sa.sa_family; dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = (rdst->remote_vni) ? : default_vni; ifindex = rdst->remote_ifindex; if (addr_family == AF_INET) { key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr; key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr; } else { key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr; key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr; } dst_cache = &rdst->dst_cache; md->gbp = skb->mark; if (flags & VXLAN_F_TTL_INHERIT) { ttl = ip_tunnel_get_ttl(old_iph, skb); } else { ttl = vxlan->cfg.ttl; if (!ttl && vxlan_addr_multicast(&rdst->remote_ip)) ttl = 1; } tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); if (tos && !info) use_cache = false; if (addr_family == AF_INET) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); else udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); #if IS_ENABLED(CONFIG_IPV6) switch (vxlan->cfg.label_policy) { case VXLAN_LABEL_FIXED: key.label = vxlan->cfg.label; break; case VXLAN_LABEL_INHERIT: key.label = ip_tunnel_get_flowlabel(old_iph, skb); break; default: DEBUG_NET_WARN_ON_ONCE(1); goto drop; } #endif } else { if (!info) { WARN_ONCE(1, "%s: Missing encapsulation instructions\n", dev->name); goto drop; } pkey = &info->key; addr_family = ip_tunnel_info_af(info); dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { if (info->options_len < sizeof(*md)) goto drop; md = ip_tunnel_info_opts(info); } ttl = info->key.ttl; tos = info->key.tos; udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); } src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); rcu_read_lock(); if (addr_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); u16 ipcb_flags = 0; struct rtable *rt; __be16 df = 0; __be32 saddr; if (!ifindex) ifindex = sock4->sock->sk->sk_bound_dev_if; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(rt)) { err = PTR_ERR(rt); reason = SKB_DROP_REASON_IP_OUTNOROUTES; goto tx_error; } if (flags & VXLAN_F_MC_ROUTE) ipcb_flags |= IPSKB_MCROUTE; if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, dst_port, ifindex, vni, &rt->dst, rt->rt_flags); if (err) goto out_unlock; if (vxlan->cfg.df == VXLAN_DF_SET) { df = htons(IP_DF); } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) { struct ethhdr *eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_IPV6 || (ntohs(eth->h_proto) == ETH_P_IP && old_iph->frag_off & htons(IP_DF))) df = htons(IP_DF); } } else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) { df = htons(IP_DF); } ndst = &rt->dst; err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv4.src = pkey->u.ipv4.dst; unclone->key.u.ipv4.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), vni, md, flags, udp_sum); if (err < 0) { reason = SKB_DROP_REASON_NOMEM; goto tx_error; } udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, src_port, dst_port, xnet, !udp_sum, ipcb_flags); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct in6_addr saddr; u16 ip6cb_flags = 0; if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); ndst = NULL; reason = SKB_DROP_REASON_IP_OUTNOROUTES; goto tx_error; } if (flags & VXLAN_F_MC_ROUTE) ip6cb_flags |= IP6SKB_MCROUTE; if (!info) { u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags; err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6, dst_port, ifindex, vni, ndst, rt6i_flags); if (err) goto out_unlock; } err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv6.src = pkey->u.ipv6.dst; unclone->key.u.ipv6.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), vni, md, flags, udp_sum); if (err < 0) { reason = SKB_DROP_REASON_NOMEM; goto tx_error; } udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum, ip6cb_flags); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); out_unlock: rcu_read_unlock(); return; drop: dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, reason); return; tx_error: rcu_read_unlock(); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); dst_release(ndst); DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb_reason(skb, reason); } static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, struct vxlan_fdb *f, __be32 vni, bool did_rsc) { struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); nh = rcu_dereference(f->nh); if (!nh) goto drop; do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); else goto drop; return; drop: dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); } static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, u32 nhid, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); rcu_read_lock(); nh = nexthop_find_by_id(dev_net(dev), nhid); if (unlikely(!nh || !nexthop_is_fdb(nh) || !nexthop_is_multipath(nh))) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); if (vxlan->cfg.saddr.sa.sa_family != nh_rdst.remote_ip.sa.sa_family) goto drop; if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, false); else goto drop; return NETDEV_TX_OK; drop: dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return NETDEV_TX_OK; } /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. * Outer UDP destination is the VXLAN assigned port. * source port is based on hash of flow */ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst, *fdst = NULL; const struct ip_tunnel_info *info; struct vxlan_fdb *f; struct ethhdr *eth; __be32 vni = 0; u32 nhid = 0; bool did_rsc; info = skb_tunnel_info(skb); skb_reset_mac_header(skb); if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && info->mode & IP_TUNNEL_INFO_TX) { vni = tunnel_id_to_key32(info->key.tun_id); nhid = info->key.nhid; } else { if (info && info->mode & IP_TUNNEL_INFO_TX) vxlan_xmit_one(skb, dev, vni, NULL, false); else kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO); return NETDEV_TX_OK; } } if (vxlan->cfg.flags & VXLAN_F_PROXY) { eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_ARP) return arp_reduce(dev, skb, vni); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(eth->h_proto) == ETH_P_IPV6 && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1); if (m->icmph.icmp6_code == 0 && m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) return neigh_reduce(dev, skb, vni); } #endif } if (nhid) return vxlan_xmit_nhid(skb, dev, nhid, vni); if (vxlan->cfg.flags & VXLAN_F_MDB) { struct vxlan_mdb_entry *mdb_entry; rcu_read_lock(); mdb_entry = vxlan_mdb_entry_skb_get(vxlan, skb, vni); if (mdb_entry) { netdev_tx_t ret; ret = vxlan_mdb_xmit(vxlan, mdb_entry, skb); rcu_read_unlock(); return ret; } rcu_read_unlock(); } eth = eth_hdr(skb); rcu_read_lock(); f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && (ntohs(eth->h_proto) == ETH_P_IP || ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); } if (f == NULL) { f = vxlan_find_mac_tx(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); goto out; } } if (rcu_access_pointer(f->nh)) { vxlan_xmit_nh(skb, dev, f, (vni ? : vxlan->default_dst.remote_vni), did_rsc); } else { list_for_each_entry_rcu(rdst, &f->remotes, list) { struct sk_buff *skb1; if (!fdst) { fdst = rdst; continue; } skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } if (fdst) vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); } out: rcu_read_unlock(); return NETDEV_TX_OK; } /* Walk the forwarding table and purge stale entries */ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = timer_container_of(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; struct vxlan_fdb *f; if (!netif_running(vxlan->dev)) return; rcu_read_lock(); hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { unsigned long timeout; if (f->state & (NUD_PERMANENT | NUD_NOARP)) continue; if (f->flags & NTF_EXT_LEARNED) continue; timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { spin_lock(&vxlan->hash_lock); if (!hlist_unhashed(&f->fdb_node)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", f->key.eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); } spin_unlock(&vxlan->hash_lock); } else if (time_before(timeout, next_timer)) { next_timer = timeout; } } rcu_read_unlock(); mod_timer(&vxlan->age_timer, next_timer); } static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) { ASSERT_RTNL(); hlist_del_init_rcu(&vxlan->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&vxlan->hlist6.hlist); #endif } static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan, struct vxlan_dev_node *node) { __be32 vni = vxlan->default_dst.remote_vni; ASSERT_RTNL(); node->vxlan = vxlan; hlist_add_head_rcu(&node->hlist, vni_head(vs, vni)); } /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int err; err = rhashtable_init(&vxlan->fdb_hash_tbl, &vxlan_fdb_rht_params); if (err) return err; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnigroup_init(vxlan); if (err) goto err_rhashtable_destroy; } err = gro_cells_init(&vxlan->gro_cells, dev); if (err) goto err_vnigroup_uninit; err = vxlan_mdb_init(vxlan); if (err) goto err_gro_cells_destroy; netdev_lockdep_set_classes(dev); return 0; err_gro_cells_destroy: gro_cells_destroy(&vxlan->gro_cells); err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); err_rhashtable_destroy: rhashtable_destroy(&vxlan->fdb_hash_tbl); return err; } static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); vxlan_mdb_fini(vxlan); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); rhashtable_destroy(&vxlan->fdb_hash_tbl); } /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int ret; ret = vxlan_sock_add(vxlan); if (ret < 0) return ret; ret = vxlan_multicast_join(vxlan); if (ret) { vxlan_sock_release(vxlan); return ret; } if (vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); return ret; } struct vxlan_fdb_flush_desc { bool ignore_default_entry; unsigned long state; unsigned long state_mask; unsigned long flags; unsigned long flags_mask; __be32 src_vni; u32 nhid; __be32 vni; __be16 port; union vxlan_addr dst_ip; }; static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan) { return is_zero_ether_addr(f->key.eth_addr) && f->key.vni == vxlan->cfg.vni; } static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) { struct nexthop *nh = rtnl_dereference(f->nh); return nh && nh->id == nhid; } static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { if (desc->state_mask && (f->state & desc->state_mask) != desc->state) return false; if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) return false; if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) return false; if (desc->src_vni && f->key.vni != desc->src_vni) return false; if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) return false; return true; } static bool vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc) { return desc->vni || desc->port || desc->dst_ip.sa.sa_family; } static bool vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc, const struct vxlan_rdst *rd) { if (desc->vni && rd->remote_vni != desc->vni) return false; if (desc->port && rd->remote_port != desc->port) return false; if (desc->dst_ip.sa.sa_family && !vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip)) return false; return true; } static void vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc, bool *p_destroy_fdb) { bool remotes_flushed = false; struct vxlan_rdst *rd, *tmp; list_for_each_entry_safe(rd, tmp, &f->remotes, list) { if (!vxlan_fdb_flush_remote_matches(desc, rd)) continue; vxlan_fdb_dst_destroy(vxlan, f, rd, true); remotes_flushed = true; } *p_destroy_fdb = remotes_flushed && list_empty(&f->remotes); } /* Purge the forwarding table */ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); struct vxlan_fdb *f; rcu_read_lock(); hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { if (!vxlan_fdb_flush_matches(f, vxlan, desc)) continue; spin_lock_bh(&vxlan->hash_lock); if (hlist_unhashed(&f->fdb_node)) goto unlock; if (match_remotes) { bool destroy_fdb = false; vxlan_fdb_flush_match_remotes(f, vxlan, desc, &destroy_fdb); if (!destroy_fdb) goto unlock; } vxlan_fdb_destroy(vxlan, f, true, true); unlock: spin_unlock_bh(&vxlan->hash_lock); } rcu_read_unlock(); } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { [NDA_SRC_VNI] = { .type = NLA_U32 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_DST] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, }; #define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \ NTF_ROUTER) static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = {}; struct ndmsg *ndm = nlmsg_data(nlh); struct nlattr *tb[NDA_MAX + 1]; u8 ndm_flags; int err; ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy, extack); if (err) return err; if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set"); return -EINVAL; } if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set"); return -EINVAL; } desc.state = ndm->ndm_state; desc.flags = ndm_flags; if (tb[NDA_NDM_STATE_MASK]) desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]); if (tb[NDA_NDM_FLAGS_MASK]) desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]); if (tb[NDA_SRC_VNI]) desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); if (tb[NDA_NH_ID]) desc.nhid = nla_get_u32(tb[NDA_NH_ID]); if (tb[NDA_VNI]) desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); if (tb[NDA_PORT]) desc.port = nla_get_be16(tb[NDA_PORT]); if (tb[NDA_DST]) { union vxlan_addr ip; err = vxlan_nla_get_addr(&ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST], "Unsupported address family"); return err; } desc.dst_ip = ip; } vxlan_flush(vxlan, &desc); return 0; } /* Cleanup timer and forwarding table on shutdown */ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { /* Default entry is deleted at vxlan_dellink. */ .ignore_default_entry = true, .state = 0, .state_mask = NUD_PERMANENT | NUD_NOARP, }; vxlan_multicast_leave(vxlan); timer_delete_sync(&vxlan->age_timer); vxlan_flush(vxlan, &desc); vxlan_sock_release(vxlan); return 0; } /* Stub, nothing needs to be done. */ static void vxlan_set_multicast_list(struct net_device *dev) { } static int vxlan_change_mtu(struct net_device *dev, int new_mtu) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); /* This check is different than dev->max_mtu, because it looks at * the lowerdev->mtu, rather than the static dev->max_mtu */ if (lowerdev) { int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags); if (new_mtu > max_mtu) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct ip_tunnel_info *info = skb_tunnel_info(skb); __be16 sport, dport; sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); dport = info->key.tp_dst ? : vxlan->cfg.dst_port; if (ip_tunnel_info_af(info) == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; if (!sock4) return -EIO; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0, &info->key.u.ipv4.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); } else { #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct dst_entry *ndst; if (!sock6) return -EIO; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, 0, &info->key.u.ipv6.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(ndst)) return PTR_ERR(ndst); dst_release(ndst); #else /* !CONFIG_IPV6 */ return -EPFNOSUPPORT; #endif } info->key.tp_src = sport; info->key.tp_dst = dport; return 0; } static const struct net_device_ops vxlan_netdev_ether_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_set_rx_mode = vxlan_set_multicast_list, .ndo_change_mtu = vxlan_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_fdb_add = vxlan_fdb_add, .ndo_fdb_del = vxlan_fdb_delete, .ndo_fdb_del_bulk = vxlan_fdb_delete_bulk, .ndo_fdb_dump = vxlan_fdb_dump, .ndo_fdb_get = vxlan_fdb_get, .ndo_mdb_add = vxlan_mdb_add, .ndo_mdb_del = vxlan_mdb_del, .ndo_mdb_del_bulk = vxlan_mdb_del_bulk, .ndo_mdb_dump = vxlan_mdb_dump, .ndo_mdb_get = vxlan_mdb_get, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; static const struct net_device_ops vxlan_netdev_raw_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_change_mtu = vxlan_change_mtu, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type vxlan_type = { .name = "vxlan", }; /* Calls the ndo_udp_tunnel_add of the caller in order to * supply the listening VXLAN udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) { struct vxlan_sock *vs; struct net *net = dev_net(dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int i; ASSERT_RTNL(); for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry(vs, &vn->sock_list[i], hlist) { unsigned short type; if (vs->flags & VXLAN_F_GPE) type = UDP_TUNNEL_TYPE_VXLAN_GPE; else type = UDP_TUNNEL_TYPE_VXLAN; if (push) udp_tunnel_push_rx_port(dev, vs->sock, type); else udp_tunnel_drop_rx_port(dev, vs->sock, type); } } } /* Initialize the device structure. */ static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); eth_hw_addr_random(dev); ether_setup(dev); dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &vxlan_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->vlan_features = dev->features; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->change_proto_down = true; dev->lltx = true; /* MTU range: 68 - 65535 */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_MAX_MTU; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; INIT_HLIST_HEAD(&vxlan->fdb_list); } static void vxlan_ether_setup(struct net_device *dev) { dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->netdev_ops = &vxlan_netdev_ether_ops; } static void vxlan_raw_setup(struct net_device *dev) { dev->header_ops = NULL; dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->netdev_ops = &vxlan_netdev_raw_ops; } static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, [IFLA_VXLAN_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), [IFLA_VXLAN_MC_ROUTE] = NLA_POLICY_MAX(NLA_U8, 1), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "MTU must be between 68 and 65535"); return -EINVAL; } } if (!data) { NL_SET_ERR_MSG(extack, "Required attributes not provided to perform the operation"); return -EINVAL; } if (data[IFLA_VXLAN_ID]) { u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); if (id >= VXLAN_N_VID) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID], "VXLAN ID must be lower than 16777216"); return -ERANGE; } } if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE], "Invalid source port range"); return -EINVAL; } } if (data[IFLA_VXLAN_DF]) { enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]); if (df < 0 || df > VXLAN_DF_MAX) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF], "Invalid DF attribute"); return -EINVAL; } } return 0; } static void vxlan_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); } static int vxlan_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); if (!lowerdev) { cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; cmd->base.speed = SPEED_UNKNOWN; return 0; } return __ethtool_get_link_ksettings(lowerdev, cmd); } static const struct ethtool_ops vxlan_ethtool_ops = { .get_drvinfo = vxlan_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = vxlan_get_link_ksettings, }; static struct socket *vxlan_create_sock(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct socket *sock; struct udp_port_cfg udp_conf; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6) { udp_conf.family = AF_INET6; udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); udp_conf.ipv6_v6only = 1; } else { udp_conf.family = AF_INET; } udp_conf.local_udp_port = port; udp_conf.bind_ifindex = ifindex; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct vxlan_sock *vs; struct socket *sock; unsigned int h; struct udp_tunnel_sock_cfg tunnel_cfg; ASSERT_RTNL(); vs = kzalloc(sizeof(*vs), GFP_KERNEL); if (!vs) return ERR_PTR(-ENOMEM); for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vs->vni_list[h]); sock = vxlan_create_sock(net, ipv6, port, flags, ifindex); if (IS_ERR(sock)) { kfree(vs); return ERR_CAST(sock); } vs->sock = sock; refcount_set(&vs->refcnt, 1); vs->flags = (flags & VXLAN_F_RCV_FLAGS); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); udp_tunnel_notify_add_rx_port(sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); /* Mark socket as an encapsulation socket. */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = vs; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = vxlan_rcv; tunnel_cfg.encap_err_lookup = vxlan_err_lookup; tunnel_cfg.encap_destroy = NULL; if (vs->flags & VXLAN_F_GPE) { tunnel_cfg.gro_receive = vxlan_gpe_gro_receive; tunnel_cfg.gro_complete = vxlan_gpe_gro_complete; } else { tunnel_cfg.gro_receive = vxlan_gro_receive; tunnel_cfg.gro_complete = vxlan_gro_complete; } setup_udp_tunnel_sock(net, sock, &tunnel_cfg); return vs; } static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; struct vxlan_sock *vs = NULL; struct vxlan_dev_node *node; int l3mdev_index = 0; ASSERT_RTNL(); if (vxlan->cfg.remote_ifindex) l3mdev_index = l3mdev_master_upper_ifindex_by_index( vxlan->net, vxlan->cfg.remote_ifindex); if (!vxlan->cfg.no_share) { rcu_read_lock(); vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (vs && !refcount_inc_not_zero(&vs->refcnt)) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); } if (!vs) vs = vxlan_socket_create(vxlan->net, ipv6, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (IS_ERR(vs)) return PTR_ERR(vs); #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(vxlan->vn6_sock, vs); node = &vxlan->hlist6; } else #endif { rcu_assign_pointer(vxlan->vn4_sock, vs); node = &vxlan->hlist4; } if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER)) vxlan_vs_add_vnigrp(vxlan, vs, ipv6); else vxlan_vs_add_dev(vs, vxlan, node); return 0; } static int vxlan_sock_add(struct vxlan_dev *vxlan) { bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata; bool ipv4 = !ipv6 || metadata; int ret = 0; RCU_INIT_POINTER(vxlan->vn4_sock, NULL); #if IS_ENABLED(CONFIG_IPV6) RCU_INIT_POINTER(vxlan->vn6_sock, NULL); if (ipv6) { ret = __vxlan_sock_add(vxlan, true); if (ret < 0 && ret != -EAFNOSUPPORT) ipv4 = false; } #endif if (ipv4) ret = __vxlan_sock_add(vxlan, false); if (ret < 0) vxlan_sock_release(vxlan); return ret; } int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, struct vxlan_config *conf, __be32 vni) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *tmp; list_for_each_entry(tmp, &vn->vxlan_list, next) { if (tmp == vxlan) continue; if (tmp->cfg.flags & VXLAN_F_VNIFILTER) { if (!vxlan_vnifilter_lookup(tmp, vni)) continue; } else if (tmp->cfg.vni != vni) { continue; } if (tmp->cfg.dst_port != conf->dst_port) continue; if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) continue; if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && tmp->cfg.remote_ifindex != conf->remote_ifindex) continue; return -EEXIST; } return 0; } static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, struct net_device **lower, struct vxlan_dev *old, struct netlink_ext_ack *extack) { bool use_ipv6 = false; if (conf->flags & VXLAN_F_GPE) { /* For now, allow GPE only together with * COLLECT_METADATA. This can be relaxed later; in such * case, the other side of the PtP link will have to be * provided. */ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG(extack, "VXLAN GPE does not support this combination of attributes"); return -EINVAL; } } if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) { /* Unless IPv6 is explicitly requested, assume IPv4 */ conf->remote_ip.sa.sa_family = AF_INET; conf->saddr.sa.sa_family = AF_INET; } else if (!conf->remote_ip.sa.sa_family) { conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family; } else if (!conf->saddr.sa.sa_family) { conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; } if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) { NL_SET_ERR_MSG(extack, "Local and remote address must be from the same family"); return -EINVAL; } if (vxlan_addr_multicast(&conf->saddr)) { NL_SET_ERR_MSG(extack, "Local address cannot be multicast"); return -EINVAL; } if (conf->saddr.sa.sa_family == AF_INET6) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } use_ipv6 = true; conf->flags |= VXLAN_F_IPV6; if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) { int local_type = ipv6_addr_type(&conf->saddr.sin6.sin6_addr); int remote_type = ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr); if (local_type & IPV6_ADDR_LINKLOCAL) { if (!(remote_type & IPV6_ADDR_LINKLOCAL) && (remote_type != IPV6_ADDR_ANY)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags |= VXLAN_F_IPV6_LINKLOCAL; } else { if (remote_type == (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL; } } } if (conf->label && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label attribute only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->label_policy && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label policy only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->remote_ifindex) { struct net_device *lowerdev; lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); if (!lowerdev) { NL_SET_ERR_MSG(extack, "Invalid local interface, device not found"); return -ENODEV; } #if IS_ENABLED(CONFIG_IPV6) if (use_ipv6) { struct inet6_dev *idev = __in6_dev_get(lowerdev); if (idev && idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 support disabled by administrator"); return -EPERM; } } #endif *lower = lowerdev; } else { if (vxlan_addr_multicast(&conf->remote_ip)) { NL_SET_ERR_MSG(extack, "Local interface required for multicast remote destination"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) { NL_SET_ERR_MSG(extack, "Local interface required for link-local local/remote addresses"); return -EINVAL; } #endif *lower = NULL; } if (!conf->dst_port) { if (conf->flags & VXLAN_F_GPE) conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT); else conf->dst_port = htons(vxlan_port); } if (!conf->age_interval) conf->age_interval = FDB_AGE_DEFAULT; if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) { NL_SET_ERR_MSG(extack, "A VXLAN device with the specified VNI already exists"); return -EEXIST; } return 0; } static void vxlan_config_apply(struct net_device *dev, struct vxlan_config *conf, struct net_device *lowerdev, struct net *src_net, bool changelink) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; unsigned short needed_headroom = ETH_HLEN; int max_mtu = ETH_MAX_MTU; u32 flags = conf->flags; if (!changelink) { if (flags & VXLAN_F_GPE) vxlan_raw_setup(dev); else vxlan_ether_setup(dev); if (conf->mtu) dev->mtu = conf->mtu; vxlan->net = src_net; } dst->remote_vni = conf->vni; memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); if (lowerdev) { dst->remote_ifindex = conf->remote_ifindex; netif_inherit_tso_max(dev, lowerdev); needed_headroom = lowerdev->hard_header_len; needed_headroom += lowerdev->needed_headroom; dev->needed_tailroom = lowerdev->needed_tailroom; max_mtu = lowerdev->mtu - vxlan_headroom(flags); if (max_mtu < ETH_MIN_MTU) max_mtu = ETH_MIN_MTU; if (!changelink && !conf->mtu) dev->mtu = max_mtu; } if (dev->mtu > max_mtu) dev->mtu = max_mtu; if (flags & VXLAN_F_COLLECT_METADATA) flags |= VXLAN_F_IPV6; needed_headroom += vxlan_headroom(flags); dev->needed_headroom = needed_headroom; memcpy(&vxlan->cfg, conf, sizeof(*conf)); } static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, struct vxlan_config *conf, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *lowerdev; int ret; ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack); if (ret) return ret; vxlan_config_apply(dev, conf, lowerdev, src_net, false); return 0; } static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_config *conf, struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_dev_configure(net, dev, conf, extack); if (err) return err; dev->ethtool_ops = &vxlan_ethtool_ops; err = register_netdevice(dev); if (err) return err; if (dst->remote_ifindex) { remote_dev = __dev_get_by_index(net, dst->remote_ifindex); if (!remote_dev) { err = -ENODEV; goto unregister; } err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) goto unregister; dst->remote_dev = remote_dev; } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, NLM_F_EXCL | NLM_F_CREATE, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); spin_unlock_bh(&vxlan->hash_lock); if (err) goto unlink; } list_add(&vxlan->next, &vn->vxlan_list); return 0; unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); unregister: unregister_netdevice(dev); return err; } /* Set/clear flags based on attribute */ static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[], int attrtype, unsigned long mask, bool changelink, bool changelink_supported, struct netlink_ext_ack *extack) { unsigned long flags; if (!tb[attrtype]) return 0; if (changelink && !changelink_supported) { vxlan_flag_attr_error(attrtype, extack); return -EOPNOTSUPP; } if (vxlan_policy[attrtype].type == NLA_FLAG) flags = conf->flags | mask; else if (nla_get_u8(tb[attrtype])) flags = conf->flags | mask; else flags = conf->flags & ~mask; conf->flags = flags; return 0; } static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { struct vxlanhdr used_bits = { .vx_flags = VXLAN_HF_VNI, .vx_vni = VXLAN_VNI_MASK, }; struct vxlan_dev *vxlan = netdev_priv(dev); int err = 0; memset(conf, 0, sizeof(*conf)); /* if changelink operation, start with old existing cfg */ if (changelink) memcpy(conf, &vxlan->cfg, sizeof(*conf)); if (data[IFLA_VXLAN_ID]) { __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); if (changelink && (vni != conf->vni)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI"); return -EOPNOTSUPP; } conf->vni = vni; } if (data[IFLA_VXLAN_GROUP]) { if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); conf->remote_ip.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_GROUP6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); conf->remote_ip.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LOCAL]) { if (changelink && (conf->saddr.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old"); return -EOPNOTSUPP; } conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); conf->saddr.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_LOCAL6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old"); return -EOPNOTSUPP; } /* TODO: respect scope id */ conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); conf->saddr.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LINK]) conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); if (data[IFLA_VXLAN_TOS]) conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); if (data[IFLA_VXLAN_TTL]) conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); if (data[IFLA_VXLAN_TTL_INHERIT]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT, VXLAN_F_TTL_INHERIT, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LABEL]) conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) & IPV6_FLOWLABEL_MASK; if (data[IFLA_VXLAN_LABEL_POLICY]) conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]); if (data[IFLA_VXLAN_LEARNING]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING, VXLAN_F_LEARN, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to learn on a new device */ conf->flags |= VXLAN_F_LEARN; } if (data[IFLA_VXLAN_AGEING]) conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); if (data[IFLA_VXLAN_PROXY]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY, VXLAN_F_PROXY, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_RSC]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC, VXLAN_F_RSC, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L2MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS, VXLAN_F_L2MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L3MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS, VXLAN_F_L3MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LIMIT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT], "Cannot change limit"); return -EOPNOTSUPP; } conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); } if (data[IFLA_VXLAN_COLLECT_METADATA]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA, VXLAN_F_COLLECT_METADATA, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_PORT_RANGE]) { if (!changelink) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); conf->port_min = ntohs(p->low); conf->port_max = ntohs(p->high); } else { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], "Cannot change port range"); return -EOPNOTSUPP; } } if (data[IFLA_VXLAN_PORT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT], "Cannot change port"); return -EOPNOTSUPP; } conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); } if (data[IFLA_VXLAN_UDP_CSUM]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM], "Cannot change UDP_CSUM flag"); return -EOPNOTSUPP; } if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; } if (data[IFLA_VXLAN_LOCALBYPASS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, VXLAN_F_LOCALBYPASS, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to local bypass on a new device */ conf->flags |= VXLAN_F_LOCALBYPASS; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, VXLAN_F_UDP_ZERO_CSUM6_RX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX, VXLAN_F_REMCSUM_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX, VXLAN_F_REMCSUM_RX, changelink, false, extack); if (err) return err; used_bits.vx_flags |= VXLAN_HF_RCO; used_bits.vx_vni |= ~VXLAN_VNI_MASK; } if (data[IFLA_VXLAN_GBP]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP, VXLAN_F_GBP, changelink, false, extack); if (err) return err; used_bits.vx_flags |= VXLAN_GBP_USED_BITS; } if (data[IFLA_VXLAN_GPE]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE, VXLAN_F_GPE, changelink, false, extack); if (err) return err; used_bits.vx_flags |= VXLAN_GPE_USED_BITS; } if (data[IFLA_VXLAN_RESERVED_BITS]) { struct vxlanhdr reserved_bits; if (changelink) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_RESERVED_BITS], "Cannot change reserved_bits"); return -EOPNOTSUPP; } nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS], sizeof(reserved_bits)); if (used_bits.vx_flags & reserved_bits.vx_flags || used_bits.vx_vni & reserved_bits.vx_vni) { __be64 ub_be64, rb_be64; memcpy(&ub_be64, &used_bits, sizeof(ub_be64)); memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64)); NL_SET_ERR_MSG_ATTR_FMT(extack, data[IFLA_VXLAN_RESERVED_BITS], "Used bits %#018llx cannot overlap reserved bits %#018llx", be64_to_cpu(ub_be64), be64_to_cpu(rb_be64)); return -EINVAL; } conf->reserved_bits = reserved_bits; } else { /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ conf->reserved_bits = (struct vxlanhdr) { .vx_flags = ~used_bits.vx_flags, .vx_vni = ~used_bits.vx_vni, }; } if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_MC_ROUTE]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_MC_ROUTE, VXLAN_F_MC_ROUTE, changelink, true, extack); if (err) return err; } if (tb[IFLA_MTU]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "Cannot change mtu"); return -EOPNOTSUPP; } conf->mtu = nla_get_u32(tb[IFLA_MTU]); } if (data[IFLA_VXLAN_DF]) conf->df = nla_get_u8(data[IFLA_VXLAN_DF]); if (data[IFLA_VXLAN_VNIFILTER]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER, VXLAN_F_VNIFILTER, changelink, false, extack); if (err) return err; if ((conf->flags & VXLAN_F_VNIFILTER) && !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER], "vxlan vnifilter only valid in collect metadata mode"); return -EINVAL; } } return 0; } static int vxlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct vxlan_config conf; int err; err = vxlan_nl2conf(tb, data, dev, &conf, false, extack); if (err) return err; return __vxlan_dev_create(link_net, dev, &conf, extack); } static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); bool rem_ip_changed, change_igmp; struct net_device *lowerdev; struct vxlan_config conf; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_nl2conf(tb, data, dev, &conf, true, extack); if (err) return err; err = vxlan_config_validate(vxlan->net, &conf, &lowerdev, vxlan, extack); if (err) return err; if (dst->remote_dev == lowerdev) lowerdev = NULL; err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev, extack); if (err) return err; rem_ip_changed = !vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip); change_igmp = vxlan->dev->flags & IFF_UP && (rem_ip_changed || dst->remote_ifindex != conf.remote_ifindex); /* handle default dst entry */ if (rem_ip_changed) { spin_lock_bh(&vxlan->hash_lock); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, NUD_REACHABLE | NUD_PERMANENT, NLM_F_APPEND | NLM_F_CREATE, vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } if (!vxlan_addr_any(&dst->remote_ip)) __vxlan_fdb_delete(vxlan, all_zeros_mac, dst->remote_ip, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, true); spin_unlock_bh(&vxlan->hash_lock); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip */ if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip, &conf.remote_ip, extack); if (err) { netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } } if (change_igmp && vxlan_addr_multicast(&dst->remote_ip)) err = vxlan_multicast_leave(vxlan); if (conf.age_interval != vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies); netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev); if (lowerdev && lowerdev != dst->remote_dev) dst->remote_dev = lowerdev; vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); if (!err && change_igmp && vxlan_addr_multicast(&dst->remote_ip)) err = vxlan_multicast_join(vxlan); return err; } static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = {}; vxlan_flush(vxlan, &desc); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); if (vxlan->default_dst.remote_dev) netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev); } static size_t vxlan_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LABEL_POLICY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ /* IFLA_VXLAN_PORT_RANGE */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + nla_total_size(0) + /* IFLA_VXLAN_GBP */ nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ /* IFLA_VXLAN_RESERVED_BITS */ nla_total_size(sizeof(struct vxlanhdr)) + 0; } static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { const struct vxlan_dev *vxlan = netdev_priv(dev); const struct vxlan_rdst *dst = &vxlan->default_dst; struct ifla_vxlan_port_range ports = { .low = htons(vxlan->cfg.port_min), .high = htons(vxlan->cfg.port_max), }; if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) goto nla_put_failure; if (!vxlan_addr_any(&dst->remote_ip)) { if (dst->remote_ip.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, dst->remote_ip.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, &dst->remote_ip.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; if (!vxlan_addr_any(&vxlan->cfg.saddr)) { if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, vxlan->cfg.saddr.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, &vxlan->cfg.saddr.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) || nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, !!(vxlan->cfg.flags & VXLAN_F_LEARN)) || nla_put_u8(skb, IFLA_VXLAN_PROXY, !!(vxlan->cfg.flags & VXLAN_F_PROXY)) || nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->cfg.flags & VXLAN_F_RSC)) || nla_put_u8(skb, IFLA_VXLAN_L2MISS, !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) || nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GBP && nla_put_flag(skb, IFLA_VXLAN_GBP)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GPE && nla_put_flag(skb, IFLA_VXLAN_GPE)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL && nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER && nla_put_u8(skb, IFLA_VXLAN_VNIFILTER, !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS, sizeof(vxlan->cfg.reserved_bits), &vxlan->cfg.reserved_bits)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vxlan_get_link_net(const struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); return READ_ONCE(vxlan->net); } static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .kind = "vxlan", .maxtype = IFLA_VXLAN_MAX, .policy = vxlan_policy, .priv_size = sizeof(struct vxlan_dev), .setup = vxlan_setup, .validate = vxlan_validate, .newlink = vxlan_newlink, .changelink = vxlan_changelink, .dellink = vxlan_dellink, .get_size = vxlan_get_size, .fill_info = vxlan_fill_info, .get_link_net = vxlan_get_link_net, }; struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; int err; memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &vxlan_link_ops, tb, NULL); if (IS_ERR(dev)) return dev; err = __vxlan_dev_create(net, dev, conf, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) { LIST_HEAD(list_kill); vxlan_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } return dev; } EXPORT_SYMBOL_GPL(vxlan_dev_create); static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, struct net_device *dev) { struct vxlan_dev *vxlan, *next; LIST_HEAD(list_kill); list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { struct vxlan_rdst *dst = &vxlan->default_dst; /* In case we created vxlan device with carrier * and we loose the carrier due to module unload * we also need to remove vxlan device. In other * cases, it's not necessary and remote_ifindex * is 0 here, so no matches. */ if (dst->remote_ifindex == dev->ifindex) vxlan_dellink(vxlan->dev, &list_kill); } unregister_netdevice_many(&list_kill); } static int vxlan_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); if (event == NETDEV_UNREGISTER) vxlan_handle_lowerdev_unregister(vn, dev); else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) vxlan_offload_rx_ports(dev, true); else if (event == NETDEV_UDP_TUNNEL_DROP_INFO) vxlan_offload_rx_ports(dev, false); return NOTIFY_DONE; } static struct notifier_block vxlan_notifier_block __read_mostly = { .notifier_call = vxlan_netdevice_event, }; static void vxlan_fdb_offloaded_set(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; spin_lock_bh(&vxlan->hash_lock); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip, fdb_info->remote_port, fdb_info->remote_vni, fdb_info->remote_ifindex); if (!rdst) goto out; rdst->offloaded = fdb_info->offloaded; out: spin_unlock_bh(&vxlan->hash_lock); } static int vxlan_fdb_external_learn_add(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; int err; extack = switchdev_notifier_info_to_extack(&fdb_info->info); spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); spin_unlock_bh(&vxlan->hash_lock); return err; } static int vxlan_fdb_external_learn_del(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; int err = 0; spin_lock_bh(&vxlan->hash_lock); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr, fdb_info->remote_ip, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, false); spin_unlock_bh(&vxlan->hash_lock); return err; } static int vxlan_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_vxlan_fdb_info *fdb_info; int err = 0; switch (event) { case SWITCHDEV_VXLAN_FDB_OFFLOADED: vxlan_fdb_offloaded_set(dev, ptr); break; case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_add(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = true; vxlan_fdb_offloaded_set(dev, fdb_info); break; case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_del(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = false; vxlan_fdb_offloaded_set(dev, fdb_info); break; } return err; } static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = { .notifier_call = vxlan_switchdev_event, }; static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); spin_lock_bh(&vxlan->hash_lock); if (!hlist_unhashed(&fdb->fdb_node)) vxlan_fdb_destroy(vxlan, fdb, false, false); spin_unlock_bh(&vxlan->hash_lock); } rcu_read_unlock(); } static int vxlan_nexthop_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct nh_notifier_info *info = ptr; struct nexthop *nh; if (event != NEXTHOP_EVENT_DEL) return NOTIFY_DONE; nh = nexthop_find_by_id(info->net, info->id); if (!nh) return NOTIFY_DONE; vxlan_fdb_nh_flush(nh); return NOTIFY_DONE; } static __net_init int vxlan_init_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int h; INIT_LIST_HEAD(&vn->vxlan_list); vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event; for (h = 0; h < PORT_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->sock_list[h]); return register_nexthop_notifier(net, &vn->nexthop_notifier_block, NULL); } static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn, struct list_head *dev_to_kill) { struct vxlan_dev *vxlan, *next; list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) vxlan_dellink(vxlan->dev, dev_to_kill); } static void __net_exit vxlan_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); ASSERT_RTNL_NET(net); __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); vxlan_destroy_tunnels(vn, dev_to_kill); } static void __net_exit vxlan_exit_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int h; for (h = 0; h < PORT_HASH_SIZE; ++h) WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); } static struct pernet_operations vxlan_net_ops = { .init = vxlan_init_net, .exit_rtnl = vxlan_exit_rtnl, .exit = vxlan_exit_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), }; static int __init vxlan_init_module(void) { int rc; rc = register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; rc = register_netdevice_notifier(&vxlan_notifier_block); if (rc) goto out2; rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block); if (rc) goto out3; rc = rtnl_link_register(&vxlan_link_ops); if (rc) goto out4; rc = vxlan_vnifilter_init(); if (rc) goto out5; return 0; out5: rtnl_link_unregister(&vxlan_link_ops); out4: unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); out3: unregister_netdevice_notifier(&vxlan_notifier_block); out2: unregister_pernet_subsys(&vxlan_net_ops); out1: return rc; } late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { vxlan_vnifilter_uninit(); rtnl_link_unregister(&vxlan_link_ops); unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); unregister_netdevice_notifier(&vxlan_notifier_block); unregister_pernet_subsys(&vxlan_net_ops); /* rcu_barrier() is called by netns */ } module_exit(vxlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(VXLAN_VERSION); MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("vxlan");
13 9 4 3 2 1 12 1 1 1 1 11 11 11 11 23 23 23 23 21 1 21 2 21 16 15 6 2 33 32 31 30 29 8 21 25 24 24 19 9 33 3 2 1 3 3 2 1 3 8 8 7 2 7 1 7 7 8 8 5 3 3 3 5 5 13 8 5 12 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 3 2 2 1 1 3 1 1 1 1 1 1 28 27 27 26 25 24 23 20 20 2 18 19 1 1 19 1 1 19 1 21 4 21 1 21 1 21 1 21 1 21 20 17 17 26 28 2 1 1 1 1 2 8 5 1 5 3 5 1 5 1 5 9 26 26 26 26 26 26 26 26 16 26 1 26 1 26 26 26 26 1 26 2 26 26 26 26 26 26 26 26 26 26 26 26 26 26 2 1 1 1 2 4 4 4 4 4 3 3 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 // SPDX-License-Identifier: GPL-2.0-only /* L2TP netlink layer, for management * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * * Partly based on the IrDA nelink implementation * (see net/irda/irnetlink.c) which is: * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org> * which is in turn partly based on the wireless netlink code: * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sock.h> #include <net/genetlink.h> #include <net/udp.h> #include <linux/in.h> #include <linux/udp.h> #include <linux/socket.h> #include <linux/module.h> #include <linux/list.h> #include <net/net_namespace.h> #include <linux/l2tp.h> #include "l2tp_core.h" static struct genl_family l2tp_nl_family; static const struct genl_multicast_group l2tp_multicast_group[] = { { .name = L2TP_GENL_MCGROUP, }, }; static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd); static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_session *session, u8 cmd); /* Accessed under genl lock */ static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info) { u32 tunnel_id; u32 session_id; char *ifname; struct l2tp_tunnel *tunnel; struct l2tp_session *session = NULL; struct net *net = genl_info_net(info); if (info->attrs[L2TP_ATTR_IFNAME]) { ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); session = l2tp_session_get_by_ifname(net, ifname); } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && (info->attrs[L2TP_ATTR_CONN_ID])) { tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (tunnel) { session = l2tp_session_get(net, tunnel->sock, tunnel->version, tunnel_id, session_id); l2tp_tunnel_put(tunnel); } } return session; } static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *hdr; int ret = -ENOBUFS; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto out; } hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &l2tp_nl_family, 0, L2TP_CMD_NOOP); if (!hdr) { ret = -EMSGSIZE; goto err_out; } genlmsg_end(msg, hdr); return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); err_out: nlmsg_free(msg); out: return ret; } static int l2tp_tunnel_notify(struct genl_family *family, struct genl_info *info, struct l2tp_tunnel *tunnel, u8 cmd) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); if (ret >= 0) { ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; return ret; } nlmsg_free(msg); return ret; } static int l2tp_session_notify(struct genl_family *family, struct genl_info *info, struct l2tp_session *session, u8 cmd) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); if (ret >= 0) { ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; return ret; } nlmsg_free(msg); return ret; } static int l2tp_nl_cmd_tunnel_create_get_addr(struct nlattr **attrs, struct l2tp_tunnel_cfg *cfg) { if (attrs[L2TP_ATTR_UDP_SPORT]) cfg->local_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_SPORT]); if (attrs[L2TP_ATTR_UDP_DPORT]) cfg->peer_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_DPORT]); cfg->use_udp_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_CSUM]); /* Must have either AF_INET or AF_INET6 address for source and destination */ #if IS_ENABLED(CONFIG_IPV6) if (attrs[L2TP_ATTR_IP6_SADDR] && attrs[L2TP_ATTR_IP6_DADDR]) { cfg->local_ip6 = nla_data(attrs[L2TP_ATTR_IP6_SADDR]); cfg->peer_ip6 = nla_data(attrs[L2TP_ATTR_IP6_DADDR]); cfg->udp6_zero_tx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); cfg->udp6_zero_rx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); return 0; } #endif if (attrs[L2TP_ATTR_IP_SADDR] && attrs[L2TP_ATTR_IP_DADDR]) { cfg->local_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_SADDR]); cfg->peer_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_DADDR]); return 0; } return -EINVAL; } static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id; u32 peer_tunnel_id; int proto_version; int fd = -1; int ret = 0; struct l2tp_tunnel_cfg cfg = { 0, }; struct l2tp_tunnel *tunnel; struct net *net = genl_info_net(info); struct nlattr **attrs = info->attrs; if (!attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(attrs[L2TP_ATTR_CONN_ID]); if (!attrs[L2TP_ATTR_PEER_CONN_ID]) { ret = -EINVAL; goto out; } peer_tunnel_id = nla_get_u32(attrs[L2TP_ATTR_PEER_CONN_ID]); if (!attrs[L2TP_ATTR_PROTO_VERSION]) { ret = -EINVAL; goto out; } proto_version = nla_get_u8(attrs[L2TP_ATTR_PROTO_VERSION]); if (!attrs[L2TP_ATTR_ENCAP_TYPE]) { ret = -EINVAL; goto out; } cfg.encap = nla_get_u16(attrs[L2TP_ATTR_ENCAP_TYPE]); /* Managed tunnels take the tunnel socket from userspace. * Unmanaged tunnels must call out the source and destination addresses * for the kernel to create the tunnel socket itself. */ if (attrs[L2TP_ATTR_FD]) { fd = nla_get_u32(attrs[L2TP_ATTR_FD]); } else { ret = l2tp_nl_cmd_tunnel_create_get_addr(attrs, &cfg); if (ret < 0) goto out; } ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: case L2TP_ENCAPTYPE_IP: ret = l2tp_tunnel_create(fd, proto_version, tunnel_id, peer_tunnel_id, &cfg, &tunnel); break; } if (ret < 0) goto out; refcount_inc(&tunnel->ref_count); ret = l2tp_tunnel_register(tunnel, net, &cfg); if (ret < 0) { kfree(tunnel); goto out; } ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_CREATE); l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; u32 tunnel_id; int ret = 0; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_DELETE); l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; u32 tunnel_id; int ret = 0; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); l2tp_tunnel_put(tunnel); out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static int l2tp_nl_tunnel_send_addr6(struct sk_buff *skb, struct sock *sk, enum l2tp_encap_type encap) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); switch (encap) { case L2TP_ENCAPTYPE_UDP: if (udp_get_no_check6_tx(sk) && nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX)) return -1; if (udp_get_no_check6_rx(sk) && nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX)) return -1; if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) return -1; fallthrough; case L2TP_ENCAPTYPE_IP: if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, &np->saddr) || nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, &sk->sk_v6_daddr)) return -1; break; } return 0; } #endif static int l2tp_nl_tunnel_send_addr4(struct sk_buff *skb, struct sock *sk, enum l2tp_encap_type encap) { struct inet_sock *inet = inet_sk(sk); switch (encap) { case L2TP_ENCAPTYPE_UDP: if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx) || nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) return -1; fallthrough; case L2TP_ENCAPTYPE_IP: if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr) || nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr)) return -1; break; } return 0; } /* Append attributes for the tunnel address, handling the different attribute types * used for different tunnel encapsulation and AF_INET v.s. AF_INET6. */ static int l2tp_nl_tunnel_send_addr(struct sk_buff *skb, struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; if (!sk) return 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return l2tp_nl_tunnel_send_addr6(skb, sk, tunnel->encap); #endif return l2tp_nl_tunnel_send_addr4(skb, sk, tunnel->encap); } static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd) { void *hdr; struct nlattr *nest; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) || nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, atomic_long_read(&tunnel->stats.tx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, atomic_long_read(&tunnel->stats.tx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, atomic_long_read(&tunnel->stats.tx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, atomic_long_read(&tunnel->stats.rx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, atomic_long_read(&tunnel->stats.rx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&tunnel->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, atomic_long_read(&tunnel->stats.rx_cookie_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&tunnel->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&tunnel->stats.rx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, atomic_long_read(&tunnel->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); if (l2tp_nl_tunnel_send_addr(skb, tunnel)) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -1; } static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; struct sk_buff *msg; u32 tunnel_id; int ret = -ENOBUFS; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto err; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto err_nlmsg; } ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, L2TP_CMD_TUNNEL_GET); if (ret < 0) goto err_nlmsg_tunnel; l2tp_tunnel_put(tunnel); return genlmsg_unicast(net, msg, info->snd_portid); err_nlmsg_tunnel: l2tp_tunnel_put(tunnel); err_nlmsg: nlmsg_free(msg); err: return ret; } struct l2tp_nl_cb_data { unsigned long tkey; unsigned long skey; }; static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; unsigned long key = cbd->tkey; struct l2tp_tunnel *tunnel; struct net *net = sock_net(skb->sk); for (;;) { tunnel = l2tp_tunnel_get_next(net, &key); if (!tunnel) goto out; if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, tunnel, L2TP_CMD_TUNNEL_GET) < 0) { l2tp_tunnel_put(tunnel); goto out; } l2tp_tunnel_put(tunnel); key++; } out: cbd->tkey = key; return skb->len; } static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id = 0; u32 session_id; u32 peer_session_id; int ret = 0; struct l2tp_tunnel *tunnel; struct l2tp_session *session; struct l2tp_session_cfg cfg = { 0, }; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } if (!info->attrs[L2TP_ATTR_SESSION_ID]) { ret = -EINVAL; goto out_tunnel; } session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { ret = -EINVAL; goto out_tunnel; } peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PW_TYPE]) { ret = -EINVAL; goto out_tunnel; } cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]); if (cfg.pw_type >= __L2TP_PWTYPE_MAX) { ret = -EINVAL; goto out_tunnel; } /* L2TPv2 only accepts PPP pseudo-wires */ if (tunnel->version == 2 && cfg.pw_type != L2TP_PWTYPE_PPP) { ret = -EPROTONOSUPPORT; goto out_tunnel; } if (tunnel->version > 2) { if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) { cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]); if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT && cfg.l2specific_type != L2TP_L2SPECTYPE_NONE) { ret = -EINVAL; goto out_tunnel; } } else { cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT; } if (info->attrs[L2TP_ATTR_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); if (len > 8) { ret = -EINVAL; goto out_tunnel; } cfg.cookie_len = len; memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len); } if (info->attrs[L2TP_ATTR_PEER_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); if (len > 8) { ret = -EINVAL; goto out_tunnel; } cfg.peer_cookie_len = len; memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len); } if (info->attrs[L2TP_ATTR_IFNAME]) cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); } if (info->attrs[L2TP_ATTR_RECV_SEQ]) cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); if (info->attrs[L2TP_ATTR_SEND_SEQ]) cfg.send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); if (info->attrs[L2TP_ATTR_LNS_MODE]) cfg.lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); #ifdef CONFIG_MODULES if (!l2tp_nl_cmd_ops[cfg.pw_type]) { genl_unlock(); request_module("net-l2tp-type-%u", cfg.pw_type); genl_lock(); } #endif if (!l2tp_nl_cmd_ops[cfg.pw_type] || !l2tp_nl_cmd_ops[cfg.pw_type]->session_create) { ret = -EPROTONOSUPPORT; goto out_tunnel; } ret = l2tp_nl_cmd_ops[cfg.pw_type]->session_create(net, tunnel, session_id, peer_session_id, &cfg); if (ret >= 0) { session = l2tp_session_get(net, tunnel->sock, tunnel->version, tunnel_id, session_id); if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); l2tp_session_put(session); } } out_tunnel: l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *info) { int ret = 0; struct l2tp_session *session; u16 pw_type; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto out; } l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_DELETE); pw_type = session->pwtype; if (pw_type < __L2TP_PWTYPE_MAX) if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) l2tp_nl_cmd_ops[pw_type]->session_delete(session); l2tp_session_put(session); out: return ret; } static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *info) { int ret = 0; struct l2tp_session *session; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto out; } if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); if (info->attrs[L2TP_ATTR_SEND_SEQ]) { struct l2tp_tunnel *tunnel = session->tunnel; session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); } if (info->attrs[L2TP_ATTR_LNS_MODE]) session->lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_MODIFY); l2tp_session_put(session); out: return ret; } static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_session *session, u8 cmd) { void *hdr; struct nlattr *nest; struct l2tp_tunnel *tunnel = session->tunnel; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; if ((session->ifname[0] && nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) || (session->cookie_len && nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, session->cookie)) || (session->peer_cookie_len && nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, session->peer_cookie)) || nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) || nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) || nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) || (l2tp_tunnel_uses_xfrm(tunnel) && nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) || (session->reorder_timeout && nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout, L2TP_ATTR_PAD))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, atomic_long_read(&session->stats.tx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, atomic_long_read(&session->stats.tx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, atomic_long_read(&session->stats.tx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, atomic_long_read(&session->stats.rx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, atomic_long_read(&session->stats.rx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&session->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, atomic_long_read(&session->stats.rx_cookie_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&session->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&session->stats.rx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, atomic_long_read(&session->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -1; } static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) { struct l2tp_session *session; struct sk_buff *msg; int ret; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_ref; } ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, 0, session, L2TP_CMD_SESSION_GET); if (ret < 0) goto err_ref_msg; ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); l2tp_session_put(session); return ret; err_ref_msg: nlmsg_free(msg); err_ref: l2tp_session_put(session); err: return ret; } static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; struct net *net = sock_net(skb->sk); struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; unsigned long tkey = cbd->tkey; unsigned long skey = cbd->skey; for (;;) { if (!tunnel) { tunnel = l2tp_tunnel_get_next(net, &tkey); if (!tunnel) goto out; } session = l2tp_session_get_next(net, tunnel->sock, tunnel->version, tunnel->tunnel_id, &skey); if (!session) { tkey++; l2tp_tunnel_put(tunnel); tunnel = NULL; skey = 0; continue; } if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, session, L2TP_CMD_SESSION_GET) < 0) { l2tp_session_put(session); l2tp_tunnel_put(tunnel); break; } l2tp_session_put(session); skey++; } out: cbd->tkey = tkey; cbd->skey = skey; return skb->len; } static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, }, [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_OFFSET] = { .type = NLA_U16, }, [L2TP_ATTR_DATA_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_TYPE] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_LEN] = { .type = NLA_U8, }, [L2TP_ATTR_PROTO_VERSION] = { .type = NLA_U8, }, [L2TP_ATTR_CONN_ID] = { .type = NLA_U32, }, [L2TP_ATTR_PEER_CONN_ID] = { .type = NLA_U32, }, [L2TP_ATTR_SESSION_ID] = { .type = NLA_U32, }, [L2TP_ATTR_PEER_SESSION_ID] = { .type = NLA_U32, }, [L2TP_ATTR_UDP_CSUM] = { .type = NLA_U8, }, [L2TP_ATTR_VLAN_ID] = { .type = NLA_U16, }, [L2TP_ATTR_DEBUG] = { .type = NLA_U32, }, [L2TP_ATTR_RECV_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_SEND_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_LNS_MODE] = { .type = NLA_U8, }, [L2TP_ATTR_USING_IPSEC] = { .type = NLA_U8, }, [L2TP_ATTR_RECV_TIMEOUT] = { .type = NLA_MSECS, }, [L2TP_ATTR_FD] = { .type = NLA_U32, }, [L2TP_ATTR_IP_SADDR] = { .type = NLA_U32, }, [L2TP_ATTR_IP_DADDR] = { .type = NLA_U32, }, [L2TP_ATTR_UDP_SPORT] = { .type = NLA_U16, }, [L2TP_ATTR_UDP_DPORT] = { .type = NLA_U16, }, [L2TP_ATTR_MTU] = { .type = NLA_U16, }, [L2TP_ATTR_MRU] = { .type = NLA_U16, }, [L2TP_ATTR_STATS] = { .type = NLA_NESTED, }, [L2TP_ATTR_IP6_SADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [L2TP_ATTR_IP6_DADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [L2TP_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1, }, [L2TP_ATTR_COOKIE] = { .type = NLA_BINARY, .len = 8, }, [L2TP_ATTR_PEER_COOKIE] = { .type = NLA_BINARY, .len = 8, }, }; static const struct genl_small_ops l2tp_nl_ops[] = { { .cmd = L2TP_CMD_NOOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_noop, /* can be retrieved by unprivileged users */ }, { .cmd = L2TP_CMD_TUNNEL_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_create, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_delete, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_modify, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_get, .dumpit = l2tp_nl_cmd_tunnel_dump, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_create, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_delete, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_modify, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_get, .dumpit = l2tp_nl_cmd_session_dump, .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family l2tp_nl_family __ro_after_init = { .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, .maxattr = L2TP_ATTR_MAX, .policy = l2tp_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = l2tp_nl_ops, .n_small_ops = ARRAY_SIZE(l2tp_nl_ops), .resv_start_op = L2TP_CMD_SESSION_GET + 1, .mcgrps = l2tp_multicast_group, .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group), }; int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops) { int ret; ret = -EINVAL; if (pw_type >= __L2TP_PWTYPE_MAX) goto err; genl_lock(); ret = -EBUSY; if (l2tp_nl_cmd_ops[pw_type]) goto out; l2tp_nl_cmd_ops[pw_type] = ops; ret = 0; out: genl_unlock(); err: return ret; } EXPORT_SYMBOL_GPL(l2tp_nl_register_ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type) { if (pw_type < __L2TP_PWTYPE_MAX) { genl_lock(); l2tp_nl_cmd_ops[pw_type] = NULL; genl_unlock(); } } EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); static int __init l2tp_nl_init(void) { pr_info("L2TP netlink interface\n"); return genl_register_family(&l2tp_nl_family); } static void l2tp_nl_cleanup(void) { genl_unregister_family(&l2tp_nl_family); } module_init(l2tp_nl_init); module_exit(l2tp_nl_cleanup); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP netlink"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); MODULE_ALIAS_GENL_FAMILY("l2tp");
316 220 316 84 51 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_EXTEND_H #define _NF_CONNTRACK_EXTEND_H #include <linux/slab.h> #include <net/netfilter/nf_conntrack.h> enum nf_ct_ext_id { NF_CT_EXT_HELPER, #if IS_ENABLED(CONFIG_NF_NAT) NF_CT_EXT_NAT, #endif NF_CT_EXT_SEQADJ, NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_CT_EXT_TSTAMP, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT NF_CT_EXT_TIMEOUT, #endif #ifdef CONFIG_NF_CONNTRACK_LABELS NF_CT_EXT_LABELS, #endif #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) NF_CT_EXT_SYNPROXY, #endif #if IS_ENABLED(CONFIG_NET_ACT_CT) NF_CT_EXT_ACT_CT, #endif NF_CT_EXT_NUM, }; /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; unsigned int gen_id; char data[] __aligned(8); }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id) { return (ct->ext && __nf_ct_ext_exist(ct->ext, id)); } void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id); static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id) { struct nf_ct_ext *ext = ct->ext; if (!ext || !__nf_ct_ext_exist(ext, id)) return NULL; if (unlikely(ext->gen_id)) return __nf_ct_ext_find(ext, id); return (void *)ct->ext + ct->ext->offset[id]; } /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); /* ext genid. if ext->id != ext_genid, extensions cannot be used * anymore unless conntrack has CONFIRMED bit set. */ extern atomic_t nf_conntrack_ext_genid; void nf_ct_ext_bump_genid(void); #endif /* _NF_CONNTRACK_EXTEND_H */
44 12 12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2009 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> */ #ifndef _LINUX_INTEGRITY_H #define _LINUX_INTEGRITY_H #include <linux/fs.h> #include <linux/iversion.h> enum integrity_status { INTEGRITY_PASS = 0, INTEGRITY_PASS_IMMUTABLE, INTEGRITY_FAIL, INTEGRITY_FAIL_IMMUTABLE, INTEGRITY_NOLABEL, INTEGRITY_NOXATTRS, INTEGRITY_UNKNOWN, }; #ifdef CONFIG_INTEGRITY extern void __init integrity_load_keys(void); #else static inline void integrity_load_keys(void) { } #endif /* CONFIG_INTEGRITY */ /* An inode's attributes for detection of changes */ struct integrity_inode_attributes { u64 version; /* track inode changes */ unsigned long ino; dev_t dev; }; /* * On stacked filesystems the i_version alone is not enough to detect file data * or metadata change. Additional metadata is required. */ static inline void integrity_inode_attrs_store(struct integrity_inode_attributes *attrs, u64 i_version, const struct inode *inode) { attrs->version = i_version; attrs->dev = inode->i_sb->s_dev; attrs->ino = inode->i_ino; } /* * On stacked filesystems detect whether the inode or its content has changed. */ static inline bool integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs, const struct inode *inode) { return (inode->i_sb->s_dev != attrs->dev || inode->i_ino != attrs->ino || !inode_eq_iversion(inode, attrs->version)); } #endif /* _LINUX_INTEGRITY_H */
10 3 3 3 3 3 3 3 3 3 3 1 1 7 7 7 6 6 6 6 6 6 9 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 /* gf128mul.c - GF(2^128) multiplication functions * * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org> * * Based on Dr Brian Gladman's (GPL'd) work published at * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php * See the original copyright notice below. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ /* --------------------------------------------------------------------------- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. LICENSE TERMS The free distribution and use of this software in both source and binary form is allowed (with or without changes) provided that: 1. distributions of this source code include the above copyright notice, this list of conditions and the following disclaimer; 2. distributions in binary form include the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other associated materials; 3. the copyright holder's name is not used to endorse products built using this software without specific written permission. ALTERNATIVELY, provided that this notice is retained in full, this product may be distributed under the terms of the GNU General Public License (GPL), in which case the provisions of the GPL apply INSTEAD OF those given above. DISCLAIMER This software is provided 'as is' with no explicit or implied warranties in respect of its properties, including, but not limited to, correctness and/or fitness for purpose. --------------------------------------------------------------------------- Issue 31/01/2006 This file provides fast multiplication in GF(2^128) as required by several cryptographic authentication modes */ #include <crypto/gf128mul.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #define gf128mul_dat(q) { \ q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\ q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\ q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\ q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\ q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\ q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\ q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\ q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\ q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\ q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\ q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\ q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\ q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\ q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\ q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\ q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\ q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\ q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\ q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\ q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\ q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\ q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\ q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\ q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\ q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\ q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\ q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\ q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\ q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\ q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\ q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\ q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ } /* * Given a value i in 0..255 as the byte overflow when a field element * in GF(2^128) is multiplied by x^8, the following macro returns the * 16-bit value that must be XOR-ed into the low-degree end of the * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1. * * There are two versions of the macro, and hence two tables: one for * the "be" convention where the highest-order bit is the coefficient of * the highest-degree polynomial term, and one for the "le" convention * where the highest-order bit is the coefficient of the lowest-degree * polynomial term. In both cases the values are stored in CPU byte * endianness such that the coefficients are ordered consistently across * bytes, i.e. in the "be" table bits 15..0 of the stored value * correspond to the coefficients of x^15..x^0, and in the "le" table * bits 15..0 correspond to the coefficients of x^0..x^15. * * Therefore, provided that the appropriate byte endianness conversions * are done by the multiplication functions (and these must be in place * anyway to support both little endian and big endian CPUs), the "be" * table can be used for multiplications of both "bbe" and "ble" * elements, and the "le" table can be used for multiplications of both * "lle" and "lbe" elements. */ #define xda_be(i) ( \ (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ ) #define xda_le(i) ( \ (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \ ) static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); /* * The following functions multiply a field element by x^8 in * the polynomial field representation. They use 64-bit word operations * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. */ static void gf128mul_x8_lle(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = gf128mul_table_le[b & 0xff]; x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); } /* time invariant version of gf128mul_x8_lle */ static void gf128mul_x8_lle_ti(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = xda_le(b & 0xff); /* avoid table lookup */ x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); } static void gf128mul_x8_bbe(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = gf128mul_table_be[a >> 56]; x->a = cpu_to_be64((a << 8) | (b >> 56)); x->b = cpu_to_be64((b << 8) ^ _tt); } void gf128mul_x8_ble(le128 *r, const le128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); u64 _tt = gf128mul_table_be[a >> 56]; r->a = cpu_to_le64((a << 8) | (b >> 56)); r->b = cpu_to_le64((b << 8) ^ _tt); } EXPORT_SYMBOL(gf128mul_x8_ble); void gf128mul_lle(be128 *r, const be128 *b) { /* * The p array should be aligned to twice the size of its element type, * so that every even/odd pair is guaranteed to share a cacheline * (assuming a cacheline size of 32 bytes or more, which is by far the * most common). This ensures that each be128_xor() call in the loop * takes the same amount of time regardless of the value of 'ch', which * is derived from function parameter 'b', which is commonly used as a * key, e.g., for GHASH. The odd array elements are all set to zero, * making each be128_xor() a NOP if its associated bit in 'ch' is not * set, and this is equivalent to calling be128_xor() conditionally. * This approach aims to avoid leaking information about such keys * through execution time variances. * * Unfortunately, __aligned(16) or higher does not work on x86 for * variables on the stack so we need to perform the alignment by hand. */ be128 array[16 + 3] = {}; be128 *p = PTR_ALIGN(&array[0], 2 * sizeof(be128)); int i; p[0] = *r; for (i = 0; i < 7; ++i) gf128mul_x_lle(&p[2 * i + 2], &p[2 * i]); memset(r, 0, sizeof(*r)); for (i = 0;;) { u8 ch = ((u8 *)b)[15 - i]; be128_xor(r, r, &p[ 0 + !(ch & 0x80)]); be128_xor(r, r, &p[ 2 + !(ch & 0x40)]); be128_xor(r, r, &p[ 4 + !(ch & 0x20)]); be128_xor(r, r, &p[ 6 + !(ch & 0x10)]); be128_xor(r, r, &p[ 8 + !(ch & 0x08)]); be128_xor(r, r, &p[10 + !(ch & 0x04)]); be128_xor(r, r, &p[12 + !(ch & 0x02)]); be128_xor(r, r, &p[14 + !(ch & 0x01)]); if (++i >= 16) break; gf128mul_x8_lle_ti(r); /* use the time invariant version */ } } EXPORT_SYMBOL(gf128mul_lle); /* This version uses 64k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key value in GF(2^128). If we consider a GF(2^128) value in the buffer's lowest byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. But we also need tables for each of the 16 higher bytes in the buffer as well, which makes 64 kbytes in total. */ /* additional explanation * t[0][BYTE] contains g*BYTE * t[1][BYTE] contains g*x^8*BYTE * .. * t[15][BYTE] contains g*x^120*BYTE */ struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g) { struct gf128mul_64k *t; int i, j, k; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out; for (i = 0; i < 16; i++) { t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL); if (!t->t[i]) { gf128mul_free_64k(t); t = NULL; goto out; } } t->t[0]->t[1] = *g; for (j = 1; j <= 64; j <<= 1) gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]); for (i = 0;;) { for (j = 2; j < 256; j += j) for (k = 1; k < j; ++k) be128_xor(&t->t[i]->t[j + k], &t->t[i]->t[j], &t->t[i]->t[k]); if (++i >= 16) break; for (j = 128; j > 0; j >>= 1) { t->t[i]->t[j] = t->t[i - 1]->t[j]; gf128mul_x8_bbe(&t->t[i]->t[j]); } } out: return t; } EXPORT_SYMBOL(gf128mul_init_64k_bbe); void gf128mul_free_64k(struct gf128mul_64k *t) { int i; for (i = 0; i < 16; i++) kfree_sensitive(t->t[i]); kfree_sensitive(t); } EXPORT_SYMBOL(gf128mul_free_64k); void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t) { u8 *ap = (u8 *)a; be128 r[1]; int i; *r = t->t[0]->t[ap[15]]; for (i = 1; i < 16; ++i) be128_xor(r, r, &t->t[i]->t[ap[15 - i]]); *a = *r; } EXPORT_SYMBOL(gf128mul_64k_bbe); /* This version uses 4k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key value in GF(2^128). If we consider a GF(2^128) value in a single byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. If we take the highest byte in the buffer and use this table to get the result, we then have to multiply by x^120 to get the final value. For the next highest byte the result has to be multiplied by x^112 and so on. But we can do this by accumulating the result in an accumulator starting with the result for the top byte. We repeatedly multiply the accumulator value by x^8 and then add in (i.e. xor) the 16 bytes of the next lower byte in the buffer, stopping when we reach the lowest byte. This requires a 4096 byte table. */ struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g) { struct gf128mul_4k *t; int j, k; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out; t->t[128] = *g; for (j = 64; j > 0; j >>= 1) gf128mul_x_lle(&t->t[j], &t->t[j+j]); for (j = 2; j < 256; j += j) for (k = 1; k < j; ++k) be128_xor(&t->t[j + k], &t->t[j], &t->t[k]); out: return t; } EXPORT_SYMBOL(gf128mul_init_4k_lle); void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t) { u8 *ap = (u8 *)a; be128 r[1]; int i = 15; *r = t->t[ap[15]]; while (i--) { gf128mul_x8_lle(r); be128_xor(r, r, &t->t[ap[i]]); } *a = *r; } EXPORT_SYMBOL(gf128mul_4k_lle); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
4 4 3 2 1 3 4 4 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 // SPDX-License-Identifier: GPL-2.0-or-later /* * authencesn.c - AEAD wrapper for IPsec with extended sequence numbers, * derived from authenc.c * * Copyright (C) 2010 secunet Security Networks AG * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/authenc.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> struct authenc_esn_instance_ctx { struct crypto_ahash_spawn auth; struct crypto_skcipher_spawn enc; }; struct crypto_authenc_esn_ctx { unsigned int reqoff; struct crypto_ahash *auth; struct crypto_skcipher *enc; }; struct authenc_esn_request_ctx { struct scatterlist src[2]; struct scatterlist dst[2]; char tail[]; }; static void authenc_esn_request_complete(struct aead_request *req, int err) { if (err != -EINPROGRESS) aead_request_complete(req, err); } static int crypto_authenc_esn_setauthsize(struct crypto_aead *authenc_esn, unsigned int authsize) { if (authsize > 0 && authsize < 4) return -EINVAL; return 0; } static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 *key, unsigned int keylen) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; struct crypto_skcipher *enc = ctx->enc; struct crypto_authenc_keys keys; int err = -EINVAL; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) goto out; crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen); if (err) goto out; crypto_skcipher_clear_flags(enc, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen); out: memzero_explicit(&keys, sizeof(keys)); return err; } static int crypto_authenc_esn_genicv_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); u8 *hash = areq_ctx->tail; unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); scatterwalk_map_and_copy(hash, dst, assoclen + cryptlen, authsize, 1); return 0; } static void authenc_esn_geniv_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_genicv_tail(req, 0); aead_request_complete(req, err); } static int crypto_authenc_esn_genicv(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *hash = areq_ctx->tail; struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; if (!authsize) return 0; /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, dst, 0, 8, 0); scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, hash, assoclen + cryptlen); ahash_request_set_callback(ahreq, flags, authenc_esn_geniv_ahash_done, req); return crypto_ahash_digest(ahreq) ?: crypto_authenc_esn_genicv_tail(req, aead_request_flags(req)); } static void crypto_authenc_esn_encrypt_done(void *data, int err) { struct aead_request *areq = data; if (!err) err = crypto_authenc_esn_genicv(areq, 0); authenc_esn_request_complete(areq, err); } static int crypto_authenc_esn_encrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_skcipher *enc = ctx->enc; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *src, *dst; int err; sg_init_table(areq_ctx->src, 2); src = scatterwalk_ffwd(areq_ctx->src, req->src, assoclen); dst = src; if (req->src != req->dst) { memcpy_sglist(req->dst, req->src, assoclen); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, assoclen); } skcipher_request_set_tfm(skreq, enc); skcipher_request_set_callback(skreq, aead_request_flags(req), crypto_authenc_esn_encrypt_done, req); skcipher_request_set_crypt(skreq, src, dst, cryptlen, req->iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; return crypto_authenc_esn_genicv(req, aead_request_flags(req)); } static int crypto_authenc_esn_decrypt_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int cryptlen = req->cryptlen - authsize; unsigned int assoclen = req->assoclen; struct scatterlist *dst = req->dst; u8 *ihash = ohash + crypto_ahash_digestsize(auth); u32 tmp[2]; if (!authsize) goto decrypt; /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); if (crypto_memneq(ihash, ohash, authsize)) return -EBADMSG; decrypt: sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, assoclen); skcipher_request_set_tfm(skreq, ctx->enc); skcipher_request_set_callback(skreq, flags, req->base.complete, req->base.data); skcipher_request_set_crypt(skreq, dst, dst, cryptlen, req->iv); return crypto_skcipher_decrypt(skreq); } static void authenc_esn_verify_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_decrypt_tail(req, 0); authenc_esn_request_complete(req, err); } static int crypto_authenc_esn_decrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; u8 *ihash = ohash + crypto_ahash_digestsize(auth); struct scatterlist *dst = req->dst; u32 tmp[2]; int err; cryptlen -= authsize; if (req->src != dst) memcpy_sglist(dst, req->src, assoclen + cryptlen); scatterwalk_map_and_copy(ihash, req->src, assoclen + cryptlen, authsize, 0); if (!authsize) goto tail; /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, dst, 0, 8, 0); scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, ohash, assoclen + cryptlen); ahash_request_set_callback(ahreq, aead_request_flags(req), authenc_esn_verify_ahash_done, req); err = crypto_ahash_digest(ahreq); if (err) return err; tail: return crypto_authenc_esn_decrypt_tail(req, aead_request_flags(req)); } static int crypto_authenc_esn_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct authenc_esn_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *auth; struct crypto_skcipher *enc; int err; auth = crypto_spawn_ahash(&ictx->auth); if (IS_ERR(auth)) return PTR_ERR(auth); enc = crypto_spawn_skcipher(&ictx->enc); err = PTR_ERR(enc); if (IS_ERR(enc)) goto err_free_ahash; ctx->auth = auth; ctx->enc = enc; ctx->reqoff = 2 * crypto_ahash_digestsize(auth); crypto_aead_set_reqsize( tfm, sizeof(struct authenc_esn_request_ctx) + ctx->reqoff + max_t(unsigned int, crypto_ahash_reqsize(auth) + sizeof(struct ahash_request), sizeof(struct skcipher_request) + crypto_skcipher_reqsize(enc))); return 0; err_free_ahash: crypto_free_ahash(auth); return err; } static void crypto_authenc_esn_exit_tfm(struct crypto_aead *tfm) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->auth); crypto_free_skcipher(ctx->enc); } static void crypto_authenc_esn_free(struct aead_instance *inst) { struct authenc_esn_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_skcipher(&ctx->enc); crypto_drop_ahash(&ctx->auth); kfree(inst); } static int crypto_authenc_esn_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct authenc_esn_instance_ctx *ctx; struct skcipher_alg_common *enc; struct hash_alg_common *auth; struct crypto_alg *auth_base; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ctx->auth, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; auth = crypto_spawn_ahash_alg(&ctx->auth); auth_base = &auth->base; err = crypto_grab_skcipher(&ctx->enc, aead_crypto_instance(inst), crypto_attr_alg_name(tb[2]), 0, mask); if (err) goto err_free_inst; enc = crypto_spawn_skcipher_alg_common(&ctx->enc); err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_name, enc->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_driver_name, enc->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = enc->base.cra_priority * 10 + auth_base->cra_priority; inst->alg.base.cra_blocksize = enc->base.cra_blocksize; inst->alg.base.cra_alignmask = enc->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct crypto_authenc_esn_ctx); inst->alg.ivsize = enc->ivsize; inst->alg.chunksize = enc->chunksize; inst->alg.maxauthsize = auth->digestsize; inst->alg.init = crypto_authenc_esn_init_tfm; inst->alg.exit = crypto_authenc_esn_exit_tfm; inst->alg.setkey = crypto_authenc_esn_setkey; inst->alg.setauthsize = crypto_authenc_esn_setauthsize; inst->alg.encrypt = crypto_authenc_esn_encrypt; inst->alg.decrypt = crypto_authenc_esn_decrypt; inst->free = crypto_authenc_esn_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_authenc_esn_free(inst); } return err; } static struct crypto_template crypto_authenc_esn_tmpl = { .name = "authencesn", .create = crypto_authenc_esn_create, .module = THIS_MODULE, }; static int __init crypto_authenc_esn_module_init(void) { return crypto_register_template(&crypto_authenc_esn_tmpl); } static void __exit crypto_authenc_esn_module_exit(void) { crypto_unregister_template(&crypto_authenc_esn_tmpl); } module_init(crypto_authenc_esn_module_init); module_exit(crypto_authenc_esn_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("AEAD wrapper for IPsec with extended sequence numbers"); MODULE_ALIAS_CRYPTO("authencesn");
1376 237 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FIB_LOOKUP_H #define _FIB_LOOKUP_H #include <linux/types.h> #include <linux/list.h> #include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; struct fib_info *fa_info; dscp_t fa_dscp; u8 fa_type; u8 fa_state; u8 fa_slen; u32 tb_id; s16 fa_default; u8 offload; u8 trap; u8 offload_failed; struct rcu_head rcu; }; #define FA_S_ACCESSED 0x01 /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) fa->fa_state |= FA_S_ACCESSED; } /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { int error; u8 scope; }; extern const struct fib_prop fib_props[RTN_MAX + 1]; #endif /* _FIB_LOOKUP_H */
5 5 5 5 19 18 18 18 18 18 18 1 17 18 18 19 19 19 15 14 15 15 15 15 15 15 15 15 19 19 19 18 19 19 19 20 1 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 4 15 15 15 15 15 19 6 19 20 41 41 41 40 22 21 21 3 3 2 2 2 2 3 2 1 6 5 4 2 26 21 1 19 19 48 41 21 21 20 20 19 2 1 1 2 2 20 20 20 20 20 20 20 28 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 // SPDX-License-Identifier: GPL-2.0-or-later /* AF_RXRPC sendmsg() implementation. * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/gfp.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" /* * Propose an abort to be made in the I/O thread. */ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, enum rxrpc_abort_reason why) { _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); if (!call->send_abort && !rxrpc_call_is_complete(call)) { call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; trace_rxrpc_abort_call(call, abort_code); /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); return true; } return false; } /* * Wait for a call to become connected. Interruption here doesn't cause the * call to be aborted. */ static int rxrpc_wait_to_be_connected(struct rxrpc_call *call, long *timeo) { DECLARE_WAITQUEUE(myself, current); int ret = 0; _enter("%d", call->debug_id); if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) goto no_wait; add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: case RXRPC_PREINTERRUPTIBLE: set_current_state(TASK_INTERRUPTIBLE); break; case RXRPC_UNINTERRUPTIBLE: default: set_current_state(TASK_UNINTERRUPTIBLE); break; } if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) break; if ((call->interruptibility == RXRPC_INTERRUPTIBLE || call->interruptibility == RXRPC_PREINTERRUPTIBLE) && signal_pending(current)) { ret = sock_intr_errno(*timeo); break; } *timeo = schedule_timeout(*timeo); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); no_wait: if (ret == 0 && rxrpc_call_is_complete(call)) ret = call->error; _leave(" = %d", ret); return ret; } /* * Return true if there's sufficient Tx queue space. */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); if (_tx_win) *_tx_win = tx_bottom; return call->send_top - tx_bottom < 256; } /* * Wait for space to appear in the Tx queue or a signal to occur. */ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (signal_pending(current)) return sock_intr_errno(*timeo); trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * Wait for space to appear in the Tx queue uninterruptibly, but with * a timeout of 2*RTT if no progress was made and a signal occurred. */ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, &tx_win)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (timeout == 0 && tx_win == tx_start && signal_pending(current)) return -EINTR; if (tx_win != tx_start) { timeout = rtt; tx_start = tx_win; } trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); timeout = schedule_timeout(timeout); } } /* * Wait for space to appear in the Tx queue uninterruptibly. */ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked */ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo, bool waitall) { DECLARE_WAITQUEUE(myself, current); int ret; _enter(",{%u,%u,%u}", call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: if (waitall) ret = rxrpc_wait_for_tx_window_waitall(rx, call); else ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); break; case RXRPC_PREINTERRUPTIBLE: case RXRPC_UNINTERRUPTIBLE: default: ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo); break; } remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); _leave(" = %d", ret); return ret; } /* * Notify the owner of the call that the transmit phase is ended and the last * packet has been queued. */ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx_t notify_end_tx) { if (notify_end_tx) notify_end_tx(&rx->sk, call, call->user_call_ID); } /* * Queue a DATA packet for transmission, set the resend timeout and send * the packet immediately. Returns the error from rxrpc_send_data_packet() * in case the caller wants to do something with it. */ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; bool poke, last = txb->flags & RXRPC_LAST_PACKET; int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); if (WARN_ON_ONCE(sq->bufs[ix])) trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); else trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); /* Add the packet to the call's output buffer */ poke = (READ_ONCE(call->tx_bottom) == call->send_top); sq->bufs[ix] = txb; /* Order send_top after the queue->next pointer and txb content. */ smp_store_release(&call->send_top, seq); if (last) { set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); call->send_queue = NULL; } if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* * Allocate a new txqueue unit and add it to the transmission queue. */ static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) { struct rxrpc_txqueue *tq; tq = kzalloc(sizeof(*tq), sk->sk_allocation); if (!tq) return -ENOMEM; tq->xmit_ts_base = KTIME_MIN; for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) tq->segment_xmit_ts[i] = UINT_MAX; if (call->send_queue) { tq->qbase = call->send_top + 1; call->send_queue->next = tq; call->send_queue = tq; } else if (WARN_ON(call->tx_queue)) { kfree(tq); return -ENOMEM; } else { /* We start at seq 1, so pretend seq 0 is hard-acked. */ tq->nr_reported_acks = 1; tq->segment_acked = 1UL; tq->qbase = 0; call->tx_qbase = 0; call->send_queue = tq; call->tx_qtail = tq; call->tx_queue = tq; } trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); return 0; } /* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx, bool *_dropped_lock) { struct rxrpc_txbuf *txb; struct sock *sk = &rx->sk; enum rxrpc_call_state state; long timeo; bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, call->cid, call->call_id, call->rx_consumed, 0, -EPROTO); return -EPROTO; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); if (ret < 0) return ret; if (call->conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = rxrpc_init_client_conn_security(call->conn); if (ret < 0) return ret; } /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); reload: txb = call->tx_pending; call->tx_pending = NULL; if (txb) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; state = rxrpc_call_state(call); ret = -ESHUTDOWN; if (state >= RXRPC_CALL_COMPLETE) goto maybe_error; ret = -EPROTO; if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && state != RXRPC_CALL_SERVER_ACK_REQUEST && state != RXRPC_CALL_SERVER_SEND_REPLY) { /* Request phase complete for this client call */ trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, call->cid, call->call_id, call->rx_consumed, 0, -EPROTO); goto maybe_error; } ret = -EMSGSIZE; if (call->tx_total_len != -1) { if (len - copied > call->tx_total_len) goto maybe_error; if (!more && len - copied != call->tx_total_len) goto maybe_error; } do { if (!txb) { size_t remain; _debug("alloc"); if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; /* See if we need to begin/extend the Tx queue. */ if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { ret = rxrpc_alloc_txqueue(sk, call); if (ret < 0) goto maybe_error; } /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. */ remain = more ? INT_MAX : msg_data_left(msg); txb = call->conn->security->alloc_txbuf(call, remain, sk->sk_allocation); if (!txb) { ret = -ENOMEM; goto maybe_error; } } _debug("append"); /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); txb->space -= copy; txb->len += copy; txb->offset += copy; copied += copy; if (call->tx_total_len != -1) call->tx_total_len -= copy; } /* check for the far side aborting the call or a network error * occurring */ if (rxrpc_call_is_complete(call)) goto call_terminated; /* add the packet to the send queue if it's now full */ if (!txb->space || (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } } while (msg_data_left(msg) > 0); success: ret = copied; if (rxrpc_call_is_complete(call) && call->error < 0) ret = call->error; out: call->tx_pending = txb; _leave(" = %d", ret); return ret; call_terminated: rxrpc_put_txbuf(txb, rxrpc_txbuf_put_send_aborted); _leave(" = %d", call->error); return call->error; maybe_error: if (copied) goto success; goto out; efault: ret = -EFAULT; goto out; wait_for_space: ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; mutex_unlock(&call->user_mutex); *_dropped_lock = true; ret = rxrpc_wait_for_tx_window(rx, call, &timeo, msg->msg_flags & MSG_WAITALL); if (ret < 0) goto maybe_error; if (call->interruptibility == RXRPC_INTERRUPTIBLE) { if (mutex_lock_interruptible(&call->user_mutex) < 0) { ret = sock_intr_errno(timeo); goto maybe_error; } } else { mutex_lock(&call->user_mutex); } *_dropped_lock = false; goto reload; } /* * extract control messages from the sendmsg() control buffer */ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) { struct cmsghdr *cmsg; bool got_user_ID = false; int len; if (msg->msg_controllen == 0) return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; len = cmsg->cmsg_len - sizeof(struct cmsghdr); _debug("CMSG %d, %d, %d", cmsg->cmsg_level, cmsg->cmsg_type, len); if (cmsg->cmsg_level != SOL_RXRPC) continue; switch (cmsg->cmsg_type) { case RXRPC_USER_CALL_ID: if (msg->msg_flags & MSG_CMSG_COMPAT) { if (len != sizeof(u32)) return -EINVAL; p->call.user_call_ID = *(u32 *)CMSG_DATA(cmsg); } else { if (len != sizeof(unsigned long)) return -EINVAL; p->call.user_call_ID = *(unsigned long *) CMSG_DATA(cmsg); } got_user_ID = true; break; case RXRPC_ABORT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_SEND_ABORT; if (len != sizeof(p->abort_code)) return -EINVAL; p->abort_code = *(unsigned int *)CMSG_DATA(cmsg); if (p->abort_code == 0) return -EINVAL; break; case RXRPC_CHARGE_ACCEPT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_CHARGE_ACCEPT; if (len != 0) return -EINVAL; break; case RXRPC_EXCLUSIVE_CALL: p->exclusive = true; if (len != 0) return -EINVAL; break; case RXRPC_UPGRADE_SERVICE: p->upgrade = true; if (len != 0) return -EINVAL; break; case RXRPC_TX_LENGTH: if (p->call.tx_total_len != -1 || len != sizeof(__s64)) return -EINVAL; p->call.tx_total_len = *(__s64 *)CMSG_DATA(cmsg); if (p->call.tx_total_len < 0) return -EINVAL; break; case RXRPC_SET_CALL_TIMEOUT: if (len & 3 || len < 4 || len > 12) return -EINVAL; memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len); p->call.nr_timeouts = len / 4; if (p->call.timeouts.hard > INT_MAX / HZ) return -ERANGE; if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000) return -ERANGE; if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000) return -ERANGE; break; default: return -EINVAL; } } if (!got_user_ID) return -EINVAL; if (p->call.tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; _leave(" = 0"); return 0; } /* * Create a new client call for sendmsg(). * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. */ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; struct rxrpc_peer *peer; struct rxrpc_call *call; struct key *key; DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); _enter(""); if (!msg->msg_name) { release_sock(&rx->sk); return ERR_PTR(-EDESTADDRREQ); } peer = rxrpc_lookup_peer(rx->local, srx, GFP_KERNEL); if (!peer) { release_sock(&rx->sk); return ERR_PTR(-ENOMEM); } key = rx->key; if (key && !rx->key->payload.data[0]) key = NULL; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.peer = peer; cp.key = rx->key; cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, &p->call, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ rxrpc_put_peer(peer, rxrpc_peer_put_application); _leave(" = %p\n", call); return call; } /* * send a message forming part of a client call through an RxRPC socket * - caller holds the socket locked * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) { struct rxrpc_call *call; bool dropped_lock = false; int ret; struct rxrpc_send_params p = { .call.tx_total_len = -1, .call.user_call_ID = 0, .call.nr_timeouts = 0, .call.interruptibility = RXRPC_INTERRUPTIBLE, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, .upgrade = false, }; _enter(""); ret = rxrpc_sendmsg_cmsg(msg, &p); if (ret < 0) goto error_release_sock; if (p.command == RXRPC_CMD_CHARGE_ACCEPT) { ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) goto error_release_sock; ret = rxrpc_user_charge_accept(rx, p.call.user_call_ID); goto error_release_sock; } call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID); if (!call) { ret = -EBADSLT; if (p.command != RXRPC_CMD_SEND_DATA) goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, &p); /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); /* ... and we have the call lock. */ p.call.nr_timeouts = 0; ret = 0; if (rxrpc_call_is_complete(call)) goto out_put_unlock; } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; case RXRPC_CALL_UNINITIALISED: case RXRPC_CALL_SERVER_PREALLOC: rxrpc_put_call(call, rxrpc_call_put_sendmsg); ret = -EBUSY; goto error_release_sock; default: break; } ret = mutex_lock_interruptible(&call->user_mutex); release_sock(&rx->sk); if (ret < 0) { ret = -ERESTARTSYS; goto error_put; } if (p.call.tx_total_len != -1) { ret = -EINVAL; if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } switch (p.call.nr_timeouts) { case 3: WRITE_ONCE(call->next_rx_timo, p.call.timeouts.normal); fallthrough; case 2: WRITE_ONCE(call->next_req_timo, p.call.timeouts.idle); fallthrough; case 1: if (p.call.timeouts.hard > 0) { ktime_t delay = ms_to_ktime(p.call.timeouts.hard * MSEC_PER_SEC); WRITE_ONCE(call->expect_term_by, ktime_add(p.call.timeouts.hard, ktime_get_real())); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); } break; } if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; goto out_put_unlock; } switch (p.command) { case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; break; case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); break; default: ret = -EINVAL; break; } out_put_unlock: if (!dropped_lock) mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put_sendmsg); _leave(" = %d", ret); return ret; error_release_sock: release_sock(&rx->sk); return ret; } /** * rxrpc_kernel_send_data - Allow a kernel service to send data on a call * @sock: The socket the call is on * @call: The call to send data through * @msg: The data to send * @len: The amount of data to send * @notify_end_tx: Notification that the last packet is queued. * * Allow a kernel service to send data on a call. The call must be in an state * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. * * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx) { bool dropped_lock = false; int ret; _enter("{%d},", call->debug_id); ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); mutex_lock(&call->user_mutex); ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, notify_end_tx, &dropped_lock); if (ret == -ESHUTDOWN) ret = call->error; if (!dropped_lock) mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } EXPORT_SYMBOL(rxrpc_kernel_send_data); /** * rxrpc_kernel_abort_call - Allow a kernel service to abort a call * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet * @error: Local error value * @why: Indication as to why. * * Allow a kernel service to abort a call if it's still in an abortable state. * * Return: %true if the call was aborted, %false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) { bool aborted; _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); aborted = rxrpc_propose_abort(call, abort_code, error, why); mutex_unlock(&call->user_mutex); return aborted; } EXPORT_SYMBOL(rxrpc_kernel_abort_call); /** * rxrpc_kernel_set_tx_length - Set the total Tx length on a call * @sock: The socket the call is on * @call: The call to be informed * @tx_total_len: The amount of data to be transmitted for this call * * Allow a kernel service to set the total transmit length on a call. This * allows buffer-to-packet encrypt-and-copy to be performed. * * This function is primarily for use for setting the reply length since the * request length can be set when beginning the call. */ void rxrpc_kernel_set_tx_length(struct socket *sock, struct rxrpc_call *call, s64 tx_total_len) { WARN_ON(call->tx_total_len != -1); call->tx_total_len = tx_total_len; } EXPORT_SYMBOL(rxrpc_kernel_set_tx_length);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking expectations. */ #ifndef _NF_CONNTRACK_EXPECT_H #define _NF_CONNTRACK_EXPECT_H #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_zones.h> extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; extern struct hlist_head *nf_ct_expect_hash; struct nf_conntrack_expect { /* Conntrack expectation list member */ struct hlist_node lnode; /* Hash member */ struct hlist_node hnode; /* We expect this tuple, with the following mask */ struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_mask mask; /* Usage count. */ refcount_t use; /* Flags */ unsigned int flags; /* Expectation class */ unsigned int class; /* Function to call after setup and insertion */ void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); /* Helper to assign to new connection */ struct nf_conntrack_helper *helper; /* The conntrack of the master connection */ struct nf_conn *master; /* Timer function; deletes the expectation. */ struct timer_list timeout; #if IS_ENABLED(CONFIG_NF_NAT) union nf_inet_addr saved_addr; /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ union nf_conntrack_man_proto saved_proto; /* Direction relative to the master connection. */ enum ip_conntrack_dir dir; #endif struct rcu_head rcu; }; static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp) { return nf_ct_net(exp->master); } #define NF_CT_EXP_POLICY_NAME_LEN 16 struct nf_conntrack_expect_policy { unsigned int max_expected; unsigned int timeout; char name[NF_CT_EXP_POLICY_NAME_LEN]; }; #define NF_CT_EXPECT_CLASS_DEFAULT 0 #define NF_CT_EXPECT_MAX_CNT 255 /* Allow to reuse expectations with the same tuples from different master * conntracks. */ #define NF_CT_EXP_F_SKIP_MASTER 0x1 int nf_conntrack_expect_pernet_init(struct net *net); void nf_conntrack_expect_pernet_fini(struct net *net); int nf_conntrack_expect_init(void); void nf_conntrack_expect_fini(void); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, bool unlink); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { nf_ct_unlink_expect_report(exp, 0, 0); } void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); bool nf_ct_remove_expect(struct nf_conntrack_expect *exp); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report); /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me); void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, const union nf_inet_addr *, const union nf_inet_addr *, u_int8_t, const __be16 *, const __be16 *); void nf_ct_expect_put(struct nf_conntrack_expect *exp); int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags); static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect, unsigned int flags) { return nf_ct_expect_related_report(expect, 0, 0, flags); } #endif /*_NF_CONNTRACK_EXPECT_H*/
77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/rculist.h> #include <linux/cookie.h> extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; extern struct ns_tree mnt_ns_tree; extern struct ns_tree net_ns_tree; extern struct ns_tree pid_ns_tree; extern struct ns_tree time_ns_tree; extern struct ns_tree user_ns_tree; extern struct ns_tree uts_ns_tree; #define to_ns_tree(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(cgroup_ns_tree), \ struct ipc_namespace *: &(ipc_ns_tree), \ struct net *: &(net_ns_tree), \ struct pid_namespace *: &(pid_ns_tree), \ struct mnt_namespace *: &(mnt_ns_tree), \ struct time_namespace *: &(time_ns_tree), \ struct user_namespace *: &(user_ns_tree), \ struct uts_namespace *: &(uts_ns_tree)) u64 ns_tree_gen_id(struct ns_common *ns); void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, struct ns_tree *ns_tree, bool previous); static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) { ns_tree_gen_id(ns); __ns_tree_add_raw(ns, ns_tree); } /** * ns_tree_add_raw - Add a namespace to a namespace * @ns: Namespace to add * * This function adds a namespace to the appropriate namespace tree * without assigning a id. */ #define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) /** * ns_tree_add - Add a namespace to a namespace tree * @ns: Namespace to add * * This function assigns a new id to the namespace and adds it to the * appropriate namespace tree and list. */ #define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) /** * ns_tree_remove - Remove a namespace from a namespace tree * @ns: Namespace to remove * * This function removes a namespace from the appropriate namespace * tree and list. */ #define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) #define ns_tree_adjoined_rcu(__ns, __previous) \ __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) #define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) #endif /* _LINUX_NSTREE_H */
20 30 7 30 30 20 30 5 5 20 5 5 5 5 5 5 20 20 5 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 30 30 30 30 30 30 30 22 22 31 31 31 23 10 6 2 4 10 7 7 7 10 31 31 31 1 30 31 23 1 1 22 22 22 22 2 2 3 20 20 30 30 30 30 27 27 30 30 20 20 30 30 30 30 30 20 29 20 30 30 7 7 7 7 7 30 30 30 30 30 30 30 30 30 20 20 20 20 20 20 20 20 30 30 7 30 30 30 30 30 30 30 30 30 30 20 20 20 20 20 20 20 30 30 30 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/crc32c.h> #include <linux/ctype.h> #include <linux/highmem.h> #include <linux/inet.h> #include <linux/kthread.h> #include <linux/net.h> #include <linux/nsproxy.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> #ifdef CONFIG_BLOCK #include <linux/bio.h> #endif /* CONFIG_BLOCK */ #include <linux/dns_resolver.h> #include <net/tcp.h> #include <trace/events/sock.h> #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> #include <linux/export.h> /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable * delivery. We tolerate TCP disconnects by reconnecting (with * exponential backoff) in the case of a fault (disconnection, bad * crc, protocol error). Acks allow sent messages to be discarded by * the sender. */ /* * We track the state of the socket on a given connection using * values defined below. The transition to a new socket state is * handled by a function which verifies we aren't coming from an * unexpected state. * * -------- * | NEW* | transient initial state * -------- * | con_sock_state_init() * v * ---------- * | CLOSED | initialized, but no socket (and no * ---------- TCP connection) * ^ \ * | \ con_sock_state_connecting() * | ---------------------- * | \ * + con_sock_state_closed() \ * |+--------------------------- \ * | \ \ \ * | ----------- \ \ * | | CLOSING | socket event; \ \ * | ----------- await close \ \ * | ^ \ | * | | \ | * | + con_sock_state_closing() \ | * | / \ | | * | / --------------- | | * | / \ v v * | / -------------- * | / -----------------| CONNECTING | socket created, TCP * | | / -------------- connect initiated * | | | con_sock_state_connected() * | | v * ------------- * | CONNECTED | TCP connection established * ------------- * * State values for ceph_connection->sock_state; NEW is assumed to be 0. */ #define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ #define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ #define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { case CEPH_CON_F_LOSSYTX: case CEPH_CON_F_KEEPALIVE_PENDING: case CEPH_CON_F_WRITE_PENDING: case CEPH_CON_F_SOCK_CLOSED: case CEPH_CON_F_BACKOFF: return true; default: return false; } } void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } bool ceph_con_flag_test_and_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } bool ceph_con_flag_test_and_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_set_bit(con_flag, &con->flags); } /* Slab caches for frequently-allocated structures */ static struct kmem_cache *ceph_msg_cache; #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); static void ceph_con_workfn(struct work_struct *); static void con_fault(struct ceph_connection *con); /* * Nicely render a sockaddr as a string. An array of formatted * strings is used, to approximate reentrancy. */ #define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ #define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) #define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) #define MAX_ADDR_STR_LEN 64 /* 54 is enough */ static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { int i; char *s; struct sockaddr_storage ss = addr->in_addr; /* align */ struct sockaddr_in *in4 = (struct sockaddr_in *)&ss; struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; switch (ss.ss_family) { case AF_INET: snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu", le32_to_cpu(addr->type), &in4->sin_addr, ntohs(in4->sin_port)); break; case AF_INET6: snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu", le32_to_cpu(addr->type), &in6->sin6_addr, ntohs(in6->sin6_port)); break; default: snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", ss.ss_family); } return s; } EXPORT_SYMBOL(ceph_pr_addr); void ceph_encode_my_addr(struct ceph_messenger *msgr) { if (!ceph_msgr2(from_msgr(msgr))) { memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); ceph_encode_banner_addr(&msgr->my_enc_addr); } } /* * work queue for all reading and writing to/from the socket. */ static struct workqueue_struct *ceph_msgr_wq; static int ceph_msgr_slab_init(void) { BUG_ON(ceph_msg_cache); ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); if (!ceph_msg_cache) return -ENOMEM; return 0; } static void ceph_msgr_slab_exit(void) { BUG_ON(!ceph_msg_cache); kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; } static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { destroy_workqueue(ceph_msgr_wq); ceph_msgr_wq = NULL; } BUG_ON(!ceph_zero_page); put_page(ceph_zero_page); ceph_zero_page = NULL; ceph_msgr_slab_exit(); } int __init ceph_msgr_init(void) { if (ceph_msgr_slab_init()) return -ENOMEM; BUG_ON(ceph_zero_page); ceph_zero_page = ZERO_PAGE(0); get_page(ceph_zero_page); /* * The number of active work items is limited by the number of * connections, so leave @max_active at default. */ ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (ceph_msgr_wq) return 0; pr_err("msgr_init failed to create workqueue\n"); _ceph_msgr_exit(); return -ENOMEM; } void ceph_msgr_exit(void) { BUG_ON(ceph_msgr_wq == NULL); _ceph_msgr_exit(); } void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } EXPORT_SYMBOL(ceph_msgr_flush); /* Connection socket state transition functions */ static void con_sock_state_init(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } static void con_sock_state_connecting(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTING); } static void con_sock_state_connected(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTED); } static void con_sock_state_closing(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSING); } static void con_sock_state_closed(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING && old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } /* * socket callback functions */ /* data available on socket, or listen socket received a connect */ static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; trace_sk_data_ready(sk); if (atomic_read(&con->msgr->stopping)) { return; } if (sk->sk_state != TCP_CLOSE_WAIT) { dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } } /* socket has buffer space for writing */ static void ceph_sock_write_space(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * doesn't get called again until try_write() fills the socket * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); } } else { dout("%s %p nothing to write\n", __func__, con); } } /* socket's state has changed */ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); fallthrough; case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: dout("%s TCP_ESTABLISHED\n", __func__); con_sock_state_connected(con); queue_con(con); break; default: /* Everything else is uninteresting */ break; } } /* * set up socket callbacks */ static void set_sock_callbacks(struct socket *sock, struct ceph_connection *con) { struct sock *sk = sock->sk; sk->sk_user_data = con; sk->sk_data_ready = ceph_sock_data_ready; sk->sk_write_space = ceph_sock_write_space; sk->sk_state_change = ceph_sock_state_change; } /* * socket helpers */ /* * initiate connection to a remote socket. */ int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; dout("%s con %p peer_addr %s\n", __func__, con, ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); memalloc_noio_restore(noio_flag); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; sock->sk->sk_use_task_frag = false; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); #endif set_sock_callbacks(sock, con); con_sock_state_connecting(con); ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", ceph_pr_addr(&con->peer_addr), sock->sk->sk_state); } else if (ret < 0) { pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr), ret); sock_release(sock); return ret; } if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; } /* * Shutdown/close the socket for the given connection. */ int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); con->sock = NULL; } /* * Forcibly clear the SOCK_CLOSED flag. It gets set * independent of the connection mutex, and we could have * received a socket close event before we had the chance to * shut the socket down. */ ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; } static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); ceph_con_close_socket(con); if (con->in_msg) { WARN_ON(con->in_msg->con != con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } if (con->out_msg) { WARN_ON(con->out_msg->con != con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } if (con->bounce_page) { __free_page(con->bounce_page); con->bounce_page = NULL; } if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_protocol(con); else ceph_con_v1_reset_protocol(con); } /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. */ static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); ceph_msg_put(msg); } static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, list_head); ceph_msg_remove(msg); } } void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); WARN_ON(con->in_msg); WARN_ON(con->out_msg); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); con->out_seq = 0; con->in_seq = 0; con->in_seq_acked = 0; if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_session(con); else ceph_con_v1_reset_session(con); } /* * mark a peer down. drop any open connections. */ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CEPH_CON_S_CLOSED; ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next connect */ ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); ceph_con_reset_protocol(con); ceph_con_reset_session(con); cancel_con(con); mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ void ceph_con_open(struct ceph_connection *con, __u8 entity_type, __u64 entity_num, struct ceph_entity_addr *addr) { mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); WARN_ON(con->state != CEPH_CON_S_CLOSED); con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); /* * return true if this connection ever successfully opened */ bool ceph_con_opened(struct ceph_connection *con) { if (ceph_msgr2(from_msgr(con->msgr))) return ceph_con_v2_opened(con); return ceph_con_v1_opened(con); } /* * initialize a new connection. */ void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); con->private = private; con->ops = ops; con->msgr = msgr; con_sock_state_init(con); mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; spin_lock(&msgr->global_seq_lock); if (msgr->global_seq < gt) msgr->global_seq = gt; ret = ++msgr->global_seq; spin_unlock(&msgr->global_seq_lock); return ret; } /* * Discard messages that have been acked by the server. */ void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { struct ceph_msg *msg; u64 seq; dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); while (!list_empty(&con->out_sent)) { msg = list_first_entry(&con->out_sent, struct ceph_msg, list_head); WARN_ON(msg->needs_out_seq); seq = le64_to_cpu(msg->hdr.seq); if (seq > ack_seq) break; dout("%s con %p discarding msg %p seq %llu\n", __func__, con, msg, seq); ceph_msg_remove(msg); } } /* * Discard messages that have been requeued in con_fault(), up to * reconnect_seq. This avoids gratuitously resending messages that * the server had received and handled prior to reconnect. */ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { struct ceph_msg *msg; u64 seq; dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); while (!list_empty(&con->out_queue)) { msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); if (msg->needs_out_seq) break; seq = le64_to_cpu(msg->hdr.seq); if (seq > reconnect_seq) break; dout("%s con %p discarding msg %p seq %llu\n", __func__, con, msg, seq); ceph_msg_remove(msg); } } #ifdef CONFIG_BLOCK /* * For a bio data item, a piece is whatever remains of the next * entry in the current bio iovec, or the first entry in the next * bio in the list. */ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_bio_iter *it = &cursor->bio_iter; cursor->resid = min_t(size_t, length, data->bio_length); *it = data->bio_pos; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, cursor->bio_iter.iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_bio_iter *it = &cursor->bio_iter; struct page *page = bio_iter_page(it->bio, it->iter); BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); cursor->resid -= bytes; bio_advance_iter(it->bio, &it->iter, bytes); if (!cursor->resid) return false; /* no more data */ if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done && page == bio_iter_page(it->bio, it->iter))) return false; /* more bytes to process in this segment */ if (!it->iter.bi_size) { it->bio = it->bio->bi_next; it->iter = it->bio->bi_iter; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; } BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); return true; } #endif /* CONFIG_BLOCK */ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct bio_vec *bvecs = data->bvec_pos.bvecs; cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); cursor->bvec_iter = data->bvec_pos.iter; cursor->bvec_iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); } static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, cursor->bvec_iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter); BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); cursor->resid -= bytes; bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); if (!cursor->resid) return false; /* no more data */ if (!bytes || (cursor->bvec_iter.bi_bvec_done && page == bvec_iter_page(bvecs, cursor->bvec_iter))) return false; /* more bytes to process in this segment */ BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); return true; } /* * For a page array, a piece comes from the first page in the array * that has not already been fully consumed. */ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; int page_count; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(!data->pages); BUG_ON(!data->length); cursor->resid = min(length, data->length); page_count = calc_pages_for(data->alignment, (u64)data->length); cursor->page_offset = data->alignment & ~PAGE_MASK; cursor->page_index = 0; BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); } static struct page * ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_index >= cursor->page_count); BUG_ON(cursor->page_offset >= PAGE_SIZE); *page_offset = cursor->page_offset; *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return data->pages[cursor->page_index]; } static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); /* Advance the cursor page offset */ cursor->resid -= bytes; cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); cursor->page_index++; return true; } /* * For a pagelist, a piece is whatever remains to be consumed in the * first page in the list, or the front of the next page. */ static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; struct page *page; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); if (!length) return; /* pagelist can be assigned but empty */ BUG_ON(list_empty(&pagelist->head)); page = list_first_entry(&pagelist->head, struct page, lru); cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; } static struct page * ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(!cursor->page); BUG_ON(cursor->offset + cursor->resid != pagelist->length); /* offset of first page in pagelist is always 0 */ *page_offset = cursor->offset & ~PAGE_MASK; *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return cursor->page; } static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(cursor->offset + cursor->resid != pagelist->length); BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); /* Advance the cursor offset */ cursor->resid -= bytes; cursor->offset += bytes; /* offset of first page in pagelist is always 0 */ if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); cursor->page = list_next_entry(cursor->page, lru); return true; } static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; cursor->iov_iter = data->iter; cursor->lastlen = 0; iov_iter_truncate(&cursor->iov_iter, length); cursor->resid = iov_iter_count(&cursor->iov_iter); } static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct page *page; ssize_t len; if (cursor->lastlen) iov_iter_revert(&cursor->iov_iter, cursor->lastlen); len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE, 1, page_offset); BUG_ON(len < 0); cursor->lastlen = len; /* * FIXME: The assumption is that the pages represented by the iov_iter * are pinned, with the references held by the upper-level * callers, or by virtue of being under writeback. Eventually, * we'll get an iov_iter_get_pages2 variant that doesn't take * page refs. Until then, just put the page ref. */ VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page); put_page(page); *length = min_t(size_t, len, cursor->resid); return page; } static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { BUG_ON(bytes > cursor->resid); cursor->resid -= bytes; if (bytes < cursor->lastlen) { cursor->lastlen -= bytes; } else { iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen); cursor->lastlen = 0; } return cursor->resid; } /* * Message data is handled (sent or received) in pieces, where each * piece resides on a single page. The network layer might not * consume an entire piece at once. A data item's cursor keeps * track of which piece is next to process and how much remains to * be processed in that piece. It also tracks whether the current * piece is the last one in the data item. */ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) { size_t length = cursor->total_resid; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(cursor, length); break; case CEPH_MSG_DATA_PAGES: ceph_msg_data_pages_cursor_init(cursor, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: ceph_msg_data_bio_cursor_init(cursor, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: ceph_msg_data_bvecs_cursor_init(cursor, length); break; case CEPH_MSG_DATA_ITER: ceph_msg_data_iter_cursor_init(cursor, length); break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ break; } cursor->need_crc = true; } void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, struct ceph_msg *msg, size_t length) { BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(!msg->num_data_items); cursor->total_resid = length; cursor->data = msg->data; cursor->sr_resid = 0; __ceph_msg_data_cursor_init(cursor); } /* * Return the page containing the next piece to process for a given * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. */ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct page *page; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: page = ceph_msg_data_pagelist_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_PAGES: page = ceph_msg_data_pages_next(cursor, page_offset, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: page = ceph_msg_data_bio_next(cursor, page_offset, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: page = ceph_msg_data_bvecs_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_ITER: page = ceph_msg_data_iter_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_NONE: default: page = NULL; break; } BUG_ON(!page); BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); BUG_ON(*length > cursor->resid); return page; } /* * Returns true if the result moves the cursor on to the next piece * of the data item. */ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; BUG_ON(bytes > cursor->resid); switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); break; case CEPH_MSG_DATA_PAGES: new_piece = ceph_msg_data_pages_advance(cursor, bytes); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: new_piece = ceph_msg_data_bio_advance(cursor, bytes); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); break; case CEPH_MSG_DATA_ITER: new_piece = ceph_msg_data_iter_advance(cursor, bytes); break; case CEPH_MSG_DATA_NONE: default: BUG(); break; } cursor->total_resid -= bytes; if (!cursor->resid && cursor->total_resid) { cursor->data++; __ceph_msg_data_cursor_init(cursor); new_piece = true; } cursor->need_crc = new_piece; } u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, unsigned int length) { char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); crc = crc32c(crc, kaddr + page_offset, length); kunmap(page); return crc; } bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr; switch (ss.ss_family) { case AF_INET: return addr4->s_addr == htonl(INADDR_ANY); case AF_INET6: return ipv6_addr_any(addr6); default: return true; } } EXPORT_SYMBOL(ceph_addr_is_blank); int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port)); case AF_INET6: return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port)); } return 0; } void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); break; case AF_INET6: put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port); break; } } /* * Unlike other *_pton function semantics, zero indicates success. */ static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr, char delim, const char **ipend) { memset(&addr->in_addr, 0, sizeof(addr->in_addr)); if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) { put_unaligned(AF_INET, &addr->in_addr.ss_family); return 0; } if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) { put_unaligned(AF_INET6, &addr->in_addr.ss_family); return 0; } return -EINVAL; } /* * Extract hostname string and resolve using kernel DNS facility. */ #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER static int ceph_dns_resolve_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { const char *end, *delim_p; char *colon_p, *ip_addr = NULL; int ip_len, ret; /* * The end of the hostname occurs immediately preceding the delimiter or * the port marker (':') where the delimiter takes precedence. */ delim_p = memchr(name, delim, namelen); colon_p = memchr(name, ':', namelen); if (delim_p && colon_p) end = min(delim_p, colon_p); else if (!delim_p && colon_p) end = colon_p; else { end = delim_p; if (!end) /* case: hostname:/ */ end = name + namelen; } if (end <= name) return -EINVAL; /* do dns_resolve upcall */ ip_len = dns_query(current->nsproxy->net_ns, NULL, name, end - name, NULL, &ip_addr, NULL, false); if (ip_len > 0) ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL); else ret = -ESRCH; kfree(ip_addr); *ipend = end; pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, ret, ret ? "failed" : ceph_pr_addr(addr)); return ret; } #else static inline int ceph_dns_resolve_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { return -EINVAL; } #endif /* * Parse a server name (IP or hostname). If a valid IP address is not found * then try to extract a hostname to resolve using userspace DNS upcall. */ static int ceph_parse_server_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { int ret; ret = ceph_pton(name, namelen, addr, delim, ipend); if (ret) ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); return ret; } /* * Parse an ip[:port] list into an addr array. Use the default * monitor port if a port isn't specified. */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count, char delim) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { char cur_delim = delim; const char *ipend; int port; if (*p == '[') { cur_delim = ']'; p++; } ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; if (cur_delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; } p++; } /* port? */ if (p < end && *p == ':') { port = 0; p++; while (p < end && *p >= '0' && *p <= '9') { port = (port * 10) + (*p - '0'); p++; } if (port == 0) port = CEPH_MON_PORT; else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; } ceph_addr_set_port(&addr[i], port); /* * We want the type to be set according to ms_mode * option, but options are normally parsed after mon * addresses. Rather than complicating parsing, set * to LEGACY and override in build_initial_monmap() * for mon addresses and ceph_messenger_init() for * ip option. */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; addr[i].nonce = 0; dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); if (p == end) break; if (*p != delim) goto bad; p++; } if (p != end) goto bad; if (count) *count = i + 1; return 0; bad: return ret; } /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. */ void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; BUG_ON(con->in_msg->con != con); con->in_msg = NULL; /* if first message, set peer_name */ if (con->peer_name.type == 0) con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); mutex_lock(&con->mutex); } /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. * Returns 0 if work was queued, or an error code otherwise. */ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) { if (!con->ops->get(con)) { dout("%s %p ref count 0\n", __func__, con); return -ENOENT; } if (delay >= HZ) delay = round_jiffies_relative(delay); dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); con->ops->put(con); return -EBUSY; } return 0; } static void queue_con(struct ceph_connection *con) { (void) queue_con_delay(con, 0); } static void cancel_con(struct ceph_connection *con) { if (cancel_delayed_work(&con->work)) { dout("%s %p\n", __func__, con); con->ops->put(con); } } static bool con_sock_closed(struct ceph_connection *con) { if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); CASE(V1_BANNER); CASE(V1_CONNECT_MSG); CASE(V2_BANNER_PREFIX); CASE(V2_BANNER_PAYLOAD); CASE(V2_HELLO); CASE(V2_AUTH); CASE(V2_AUTH_SIGNATURE); CASE(V2_SESSION_CONNECT); CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: BUG(); } #undef CASE return true; } static bool con_backoff(struct ceph_connection *con) { int ret; if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; } /* Finish fault handling; con->mutex must *not* be held here */ static void con_fault_finish(struct ceph_connection *con) { dout("%s %p\n", __func__, con); /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) { dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); con->v1.auth_retry = 0; } if (con->ops->fault) con->ops->fault(con); } /* * Do some work on a connection. Drop a connection ref when we're done. */ static void ceph_con_workfn(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); bool fault; mutex_lock(&con->mutex); while (true) { int ret; if ((fault = con_sock_closed(con))) { dout("%s: con %p SOCK_CLOSED\n", __func__, con); break; } if (con_backoff(con)) { dout("%s: con %p BACKOFF\n", __func__, con); break; } if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } if (ceph_msgr2(from_msgr(con->msgr))) ret = ceph_con_v2_try_read(con); else ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on read"; fault = true; break; } if (ceph_msgr2(from_msgr(con->msgr))) ret = ceph_con_v2_try_write(con); else ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on write"; fault = true; } break; /* If we make it to here, we're done */ } if (fault) con_fault(con); mutex_unlock(&con->mutex); if (fault) con_fault_finish(con); con->ops->put(con); } /* * Generic error/fault handler. A retry mechanism is used with * exponential backoff */ static void con_fault(struct ceph_connection *con) { dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; WARN_ON(con->state == CEPH_CON_S_STANDBY || con->state == CEPH_CON_S_CLOSED); ceph_con_reset_protocol(con); if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CEPH_CON_S_CLOSED; return; } /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. */ con->state = CEPH_CON_S_PREOPEN; if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; if (con->delay > MAX_DELAY_INTERVAL) con->delay = MAX_DELAY_INTERVAL; } ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); ceph_encode_my_addr(msgr); } /* * initialize a new messenger instance */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr) { spin_lock_init(&msgr->global_seq_lock); if (myaddr) { memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, sizeof(msgr->inst.addr.in_addr)); ceph_addr_set_port(&msgr->inst.addr, 0); } /* * Since nautilus, clients are identified using type ANY. * For msgr1, ceph_encode_banner_addr() munges it to NONE. */ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; /* generate a random non-zero nonce */ do { get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); } while (!msgr->inst.addr.nonce); ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { if (msg->con) msg->con->ops->put(msg->con); msg->con = con ? con->ops->get(con) : NULL; BUG_ON(msg->con != con); } static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p\n", con); con->state = CEPH_CON_S_PREOPEN; if (!ceph_msgr2(from_msgr(con->msgr))) con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. * * Consumes a ref on @msg. */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { /* set src+dst */ msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); msg->needs_out_seq = true; mutex_lock(&con->mutex); if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); return; } msg_con_set(msg, con); BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send */ void ceph_msg_revoke(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (list_empty(&msg->list_head)) { WARN_ON(con->out_msg == msg); dout("%s con %p msg %p not linked\n", __func__, con, msg); mutex_unlock(&con->mutex); return; } dout("%s con %p msg %p was linked\n", __func__, con, msg); msg->hdr.seq = 0; ceph_msg_remove(msg); if (con->out_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_revoke(con, msg); else ceph_con_v1_revoke(con, msg); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { dout("%s con %p msg %p not current, out_msg %p\n", __func__, con, msg, con->out_msg); } mutex_unlock(&con->mutex); } /* * Revoke a message that we may be reading data into */ void ceph_msg_revoke_incoming(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (con->in_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was recving\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_revoke_incoming(con); else ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } else { dout("%s con %p msg %p not current, in_msg %p\n", __func__, con, msg, con->in_msg); } mutex_unlock(&con->mutex); } /* * Queue a keepalive byte to ensure the tcp connection is alive. */ void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); bool ceph_con_keepalive_expired(struct ceph_connection *con, unsigned long interval) { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { struct timespec64 now; struct timespec64 ts; ktime_get_real_ts64(&now); jiffies_to_timespec64(interval, &ts); ts = timespec64_add(con->last_keepalive_ack, ts); return timespec64_compare(&now, &ts) >= 0; } return false; } static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) { BUG_ON(msg->num_data_items >= msg->max_data_items); return &msg->data[msg->num_data_items++]; } static void ceph_msg_data_destroy(struct ceph_msg_data *data) { if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) { int num_pages = calc_pages_for(data->alignment, data->length); ceph_release_page_vector(data->pages, num_pages); } else if (data->type == CEPH_MSG_DATA_PAGELIST) { ceph_pagelist_release(data->pagelist); } } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment, bool own_pages) { struct ceph_msg_data *data; BUG_ON(!pages); BUG_ON(!length); data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_PAGES; data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; data->own_pages = own_pages; msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_pages); void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { struct ceph_msg_data *data; BUG_ON(!pagelist); BUG_ON(!pagelist->length); data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_PAGELIST; refcount_inc(&pagelist->refcnt); data->pagelist = pagelist; msg->data_length += pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_add_pagelist); #ifdef CONFIG_BLOCK void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, u32 length) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_BIO; data->bio_pos = *bio_pos; data->bio_length = length; msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_bio); #endif /* CONFIG_BLOCK */ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, struct ceph_bvec_iter *bvec_pos) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_BVECS; data->bvec_pos = *bvec_pos; msg->data_length += bvec_pos->iter.bi_size; } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); void ceph_msg_data_add_iter(struct ceph_msg *msg, struct iov_iter *iter) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_ITER; data->iter = *iter; msg->data_length += iov_iter_count(&data->iter); } /* * construct a new message with given type, size * the new msg has a ref count of 1. */ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, gfp_t flags, bool can_fail) { struct ceph_msg *m; m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); m->hdr.front_len = cpu_to_le32(front_len); INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); /* front */ if (front_len) { m->front.iov_base = kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); goto out2; } } else { m->front.iov_base = NULL; } m->front_alloc_len = m->front.iov_len = front_len; if (max_data_items) { m->data = kmalloc_array(max_data_items, sizeof(*m->data), flags); if (!m->data) goto out2; m->max_data_items = max_data_items; } dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: if (!can_fail) { pr_err("msg_new can't create type %d front %d\n", type, front_len); WARN_ON(1); } else { dout("msg_new can't create type %d front %d\n", type, front_len); } return NULL; } EXPORT_SYMBOL(ceph_msg_new2); struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail) { return ceph_msg_new2(type, front_len, 0, flags, can_fail); } EXPORT_SYMBOL(ceph_msg_new); /* * Allocate "middle" portion of a message, if it is needed and wasn't * allocated by alloc_msg. This allows us to read a small fixed-size * per-type header in the front and then gracefully fail (i.e., * propagate the error to the caller based on info in the front) when * the middle is too large. */ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); int middle_len = le32_to_cpu(msg->hdr.middle_len); dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, ceph_msg_type_name(type), middle_len); BUG_ON(!middle_len); BUG_ON(msg->middle); msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); if (!msg->middle) return -ENOMEM; return 0; } /* * Allocate a message for receiving an incoming message on a * connection, and save the result in con->in_msg. Uses the * connection's private alloc_msg op if available. * * Returns 0 on success, or a negative error code. * * On success, if we set *skip = 1: * - the next message should be skipped and ignored. * - con->in_msg == NULL * or if we set *skip = 0: * - con->in_msg is non-null. * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); BUG_ON(!con->ops->alloc_msg); mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; } if (msg) { BUG_ON(*skip); msg_con_set(msg, con); con->in_msg = msg; } else { /* * Null message pointer means either we should skip * this message or we couldn't allocate memory. The * former is not an error. */ if (*skip) return 0; con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); if (ret < 0) { ceph_msg_put(con->in_msg); con->in_msg = NULL; } } return ret; } struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; if (list_empty(&con->out_queue)) return NULL; msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); WARN_ON(msg->con != con); /* * Put the message on "sent" list using a ref from ceph_con_send(). * It is put when the message is acked or revoked. */ list_move_tail(&msg->list_head, &con->out_sent); /* * Only assign outgoing seq # if we haven't sent this message * yet. If it is requeued, resend with it's original seq. */ if (msg->needs_out_seq) { msg->hdr.seq = cpu_to_le64(++con->out_seq); msg->needs_out_seq = false; if (con->ops->reencode_message) con->ops->reencode_message(msg); } /* * Get a ref for out_msg. It is put when we are done sending the * message or in case of a fault. */ WARN_ON(con->out_msg); return con->out_msg = ceph_msg_get(msg); } /* * Free a generically kmalloc'd message. */ static void ceph_msg_free(struct ceph_msg *m) { dout("%s %p\n", __func__, m); kvfree(m->front.iov_base); kfree(m->data); kmem_cache_free(ceph_msg_cache, m); } static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); int i; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); msg_con_set(m, NULL); /* drop middle, data, if any */ if (m->middle) { ceph_buffer_put(m->middle); m->middle = NULL; } for (i = 0; i < m->num_data_items; i++) ceph_msg_data_destroy(&m->data[i]); if (m->pool) ceph_msgpool_put(m->pool, m); else ceph_msg_free(m); } struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_get(&msg->kref); return msg; } EXPORT_SYMBOL(ceph_msg_get); void ceph_msg_put(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_put(&msg->kref, ceph_msg_release); } EXPORT_SYMBOL(ceph_msg_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); print_hex_dump(KERN_DEBUG, " front: ", DUMP_PREFIX_OFFSET, 16, 1, msg->front.iov_base, msg->front.iov_len, true); if (msg->middle) print_hex_dump(KERN_DEBUG, "middle: ", DUMP_PREFIX_OFFSET, 16, 1, msg->middle->vec.iov_base, msg->middle->vec.iov_len, true); print_hex_dump(KERN_DEBUG, "footer: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->footer, sizeof(msg->footer), true); } EXPORT_SYMBOL(ceph_msg_dump);
160 7 147 231 229 231 231 231 230 231 231 4 4 4 3 3 3 1 1 2 160 160 116 145 116 426 233 200 418 389 392 57 56 9 9 9 8 9 9 9 9 9 9 426 427 232 232 230 232 199 200 427 427 401 417 149 392 415 416 418 416 418 418 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 // SPDX-License-Identifier: GPL-2.0-or-later /* * libata-core.c - helper library for ATA * * Copyright 2003-2004 Red Hat, Inc. All rights reserved. * Copyright 2003-2004 Jeff Garzik * * libata documentation is available via 'make {ps|pdf}docs', * as Documentation/driver-api/libata.rst * * Hardware documentation available from http://www.t13.org/ and * http://www.sata-io.org/ * * Standards documents from: * http://www.t13.org (ATA standards, PCI DMA IDE spec) * http://www.t10.org (SCSI MMC - for ATAPI MMC) * http://www.sata-io.org (SATA) * http://www.compactflash.org (CF) * http://www.qic.org (QIC157 - Tape and DSC) * http://www.ce-ata.org (CE-ATA: not supported) * * libata is essentially a library of internal helper functions for * low-level ATA host controller drivers. As such, the API/ABI is * likely to change as new drivers are added and updated. * Do not depend on ABI/API stability. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/init.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/timer.h> #include <linux/time.h> #include <linux/interrupt.h> #include <linux/completion.h> #include <linux/suspend.h> #include <linux/workqueue.h> #include <linux/scatterlist.h> #include <linux/io.h> #include <linux/log2.h> #include <linux/slab.h> #include <linux/glob.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> #include <linux/libata.h> #include <asm/byteorder.h> #include <linux/unaligned.h> #include <linux/cdrom.h> #include <linux/ratelimit.h> #include <linux/leds.h> #include <linux/pm_runtime.h> #include <linux/platform_device.h> #include <asm/setup.h> #define CREATE_TRACE_POINTS #include <trace/events/libata.h> #include "libata.h" #include "libata-transport.h" const struct ata_port_operations ata_base_port_ops = { .reset.prereset = ata_std_prereset, .reset.postreset = ata_std_postreset, .error_handler = ata_std_error_handler, .sched_eh = ata_std_sched_eh, .end_eh = ata_std_end_eh, }; static unsigned int ata_dev_init_params(struct ata_device *dev, u16 heads, u16 sectors); static unsigned int ata_dev_set_xfermode(struct ata_device *dev); static void ata_dev_xfermask(struct ata_device *dev); static unsigned int ata_dev_quirks(const struct ata_device *dev); static DEFINE_IDA(ata_ida); #ifdef CONFIG_ATA_FORCE struct ata_force_param { const char *name; u8 cbl; u8 spd_limit; unsigned int xfer_mask; unsigned int quirk_on; unsigned int quirk_off; unsigned int pflags_on; u16 lflags_on; u16 lflags_off; }; struct ata_force_ent { int port; int device; struct ata_force_param param; }; static struct ata_force_ent *ata_force_tbl; static int ata_force_tbl_size; static char ata_force_param_buf[COMMAND_LINE_SIZE] __initdata; /* param_buf is thrown away after initialization, disallow read */ module_param_string(force, ata_force_param_buf, sizeof(ata_force_param_buf), 0); MODULE_PARM_DESC(force, "Force ATA configurations including cable type, link speed and transfer mode (see Documentation/admin-guide/kernel-parameters.rst for details)"); #endif static int atapi_enabled = 1; module_param(atapi_enabled, int, 0444); MODULE_PARM_DESC(atapi_enabled, "Enable discovery of ATAPI devices (0=off, 1=on [default])"); static int atapi_dmadir = 0; module_param(atapi_dmadir, int, 0444); MODULE_PARM_DESC(atapi_dmadir, "Enable ATAPI DMADIR bridge support (0=off [default], 1=on)"); int atapi_passthru16 = 1; module_param(atapi_passthru16, int, 0444); MODULE_PARM_DESC(atapi_passthru16, "Enable ATA_16 passthru for ATAPI devices (0=off, 1=on [default])"); int libata_fua = 0; module_param_named(fua, libata_fua, int, 0444); MODULE_PARM_DESC(fua, "FUA support (0=off [default], 1=on)"); static int ata_ignore_hpa; module_param_named(ignore_hpa, ata_ignore_hpa, int, 0644); MODULE_PARM_DESC(ignore_hpa, "Ignore HPA limit (0=keep BIOS limits, 1=ignore limits, using full disk)"); static int libata_dma_mask = ATA_DMA_MASK_ATA|ATA_DMA_MASK_ATAPI|ATA_DMA_MASK_CFA; module_param_named(dma, libata_dma_mask, int, 0444); MODULE_PARM_DESC(dma, "DMA enable/disable (0x1==ATA, 0x2==ATAPI, 0x4==CF)"); static int ata_probe_timeout; module_param(ata_probe_timeout, int, 0444); MODULE_PARM_DESC(ata_probe_timeout, "Set ATA probing timeout (seconds)"); int libata_noacpi = 0; module_param_named(noacpi, libata_noacpi, int, 0444); MODULE_PARM_DESC(noacpi, "Disable the use of ACPI in probe/suspend/resume (0=off [default], 1=on)"); int libata_allow_tpm = 0; module_param_named(allow_tpm, libata_allow_tpm, int, 0444); MODULE_PARM_DESC(allow_tpm, "Permit the use of TPM commands (0=off [default], 1=on)"); static int atapi_an; module_param(atapi_an, int, 0444); MODULE_PARM_DESC(atapi_an, "Enable ATAPI AN media presence notification (0=0ff [default], 1=on)"); MODULE_AUTHOR("Jeff Garzik"); MODULE_DESCRIPTION("Library module for ATA devices"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); static inline bool ata_dev_print_info(const struct ata_device *dev) { struct ata_eh_context *ehc = &dev->link->eh_context; return ehc->i.flags & ATA_EHI_PRINTINFO; } /** * ata_link_next - link iteration helper * @link: the previous link, NULL to start * @ap: ATA port containing links to iterate * @mode: iteration mode, one of ATA_LITER_* * * LOCKING: * Host lock or EH context. * * RETURNS: * Pointer to the next link. */ struct ata_link *ata_link_next(struct ata_link *link, struct ata_port *ap, enum ata_link_iter_mode mode) { BUG_ON(mode != ATA_LITER_EDGE && mode != ATA_LITER_PMP_FIRST && mode != ATA_LITER_HOST_FIRST); /* NULL link indicates start of iteration */ if (!link) switch (mode) { case ATA_LITER_EDGE: case ATA_LITER_PMP_FIRST: if (sata_pmp_attached(ap)) return ap->pmp_link; fallthrough; case ATA_LITER_HOST_FIRST: return &ap->link; } /* we just iterated over the host link, what's next? */ if (link == &ap->link) switch (mode) { case ATA_LITER_HOST_FIRST: if (sata_pmp_attached(ap)) return ap->pmp_link; fallthrough; case ATA_LITER_PMP_FIRST: if (unlikely(ap->slave_link)) return ap->slave_link; fallthrough; case ATA_LITER_EDGE: return NULL; } /* slave_link excludes PMP */ if (unlikely(link == ap->slave_link)) return NULL; /* we were over a PMP link */ if (++link < ap->pmp_link + ap->nr_pmp_links) return link; if (mode == ATA_LITER_PMP_FIRST) return &ap->link; return NULL; } EXPORT_SYMBOL_GPL(ata_link_next); /** * ata_dev_next - device iteration helper * @dev: the previous device, NULL to start * @link: ATA link containing devices to iterate * @mode: iteration mode, one of ATA_DITER_* * * LOCKING: * Host lock or EH context. * * RETURNS: * Pointer to the next device. */ struct ata_device *ata_dev_next(struct ata_device *dev, struct ata_link *link, enum ata_dev_iter_mode mode) { BUG_ON(mode != ATA_DITER_ENABLED && mode != ATA_DITER_ENABLED_REVERSE && mode != ATA_DITER_ALL && mode != ATA_DITER_ALL_REVERSE); /* NULL dev indicates start of iteration */ if (!dev) switch (mode) { case ATA_DITER_ENABLED: case ATA_DITER_ALL: dev = link->device; goto check; case ATA_DITER_ENABLED_REVERSE: case ATA_DITER_ALL_REVERSE: dev = link->device + ata_link_max_devices(link) - 1; goto check; } next: /* move to the next one */ switch (mode) { case ATA_DITER_ENABLED: case ATA_DITER_ALL: if (++dev < link->device + ata_link_max_devices(link)) goto check; return NULL; case ATA_DITER_ENABLED_REVERSE: case ATA_DITER_ALL_REVERSE: if (--dev >= link->device) goto check; return NULL; } check: if ((mode == ATA_DITER_ENABLED || mode == ATA_DITER_ENABLED_REVERSE) && !ata_dev_enabled(dev)) goto next; return dev; } EXPORT_SYMBOL_GPL(ata_dev_next); /** * ata_dev_phys_link - find physical link for a device * @dev: ATA device to look up physical link for * * Look up physical link which @dev is attached to. Note that * this is different from @dev->link only when @dev is on slave * link. For all other cases, it's the same as @dev->link. * * LOCKING: * Don't care. * * RETURNS: * Pointer to the found physical link. */ struct ata_link *ata_dev_phys_link(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; if (!ap->slave_link) return dev->link; if (!dev->devno) return &ap->link; return ap->slave_link; } #ifdef CONFIG_ATA_FORCE /** * ata_force_cbl - force cable type according to libata.force * @ap: ATA port of interest * * Force cable type according to libata.force and whine about it. * The last entry which has matching port number is used, so it * can be specified as part of device force parameters. For * example, both "a:40c,1.00:udma4" and "1.00:40c,udma4" have the * same effect. * * LOCKING: * EH context. */ void ata_force_cbl(struct ata_port *ap) { int i; for (i = ata_force_tbl_size - 1; i >= 0; i--) { const struct ata_force_ent *fe = &ata_force_tbl[i]; if (fe->port != -1 && fe->port != ap->print_id) continue; if (fe->param.cbl == ATA_CBL_NONE) continue; ap->cbl = fe->param.cbl; ata_port_notice(ap, "FORCE: cable set to %s\n", fe->param.name); return; } } /** * ata_force_pflags - force port flags according to libata.force * @ap: ATA port of interest * * Force port flags according to libata.force and whine about it. * * LOCKING: * EH context. */ static void ata_force_pflags(struct ata_port *ap) { int i; for (i = ata_force_tbl_size - 1; i >= 0; i--) { const struct ata_force_ent *fe = &ata_force_tbl[i]; if (fe->port != -1 && fe->port != ap->print_id) continue; /* let pflags stack */ if (fe->param.pflags_on) { ap->pflags |= fe->param.pflags_on; ata_port_notice(ap, "FORCE: port flag 0x%x forced -> 0x%x\n", fe->param.pflags_on, ap->pflags); } } } /** * ata_force_link_limits - force link limits according to libata.force * @link: ATA link of interest * * Force link flags and SATA spd limit according to libata.force * and whine about it. When only the port part is specified * (e.g. 1:), the limit applies to all links connected to both * the host link and all fan-out ports connected via PMP. If the * device part is specified as 0 (e.g. 1.00:), it specifies the * first fan-out link not the host link. Device number 15 always * points to the host link whether PMP is attached or not. If the * controller has slave link, device number 16 points to it. * * LOCKING: * EH context. */ static void ata_force_link_limits(struct ata_link *link) { bool did_spd = false; int linkno = link->pmp; int i; if (ata_is_host_link(link)) linkno += 15; for (i = ata_force_tbl_size - 1; i >= 0; i--) { const struct ata_force_ent *fe = &ata_force_tbl[i]; if (fe->port != -1 && fe->port != link->ap->print_id) continue; if (fe->device != -1 && fe->device != linkno) continue; /* only honor the first spd limit */ if (!did_spd && fe->param.spd_limit) { link->hw_sata_spd_limit = (1 << fe->param.spd_limit) - 1; ata_link_notice(link, "FORCE: PHY spd limit set to %s\n", fe->param.name); did_spd = true; } /* let lflags stack */ if (fe->param.lflags_on) { link->flags |= fe->param.lflags_on; ata_link_notice(link, "FORCE: link flag 0x%x forced -> 0x%x\n", fe->param.lflags_on, link->flags); } if (fe->param.lflags_off) { link->flags &= ~fe->param.lflags_off; ata_link_notice(link, "FORCE: link flag 0x%x cleared -> 0x%x\n", fe->param.lflags_off, link->flags); } } } /** * ata_force_xfermask - force xfermask according to libata.force * @dev: ATA device of interest * * Force xfer_mask according to libata.force and whine about it. * For consistency with link selection, device number 15 selects * the first device connected to the host link. * * LOCKING: * EH context. */ static void ata_force_xfermask(struct ata_device *dev) { int devno = dev->link->pmp + dev->devno; int alt_devno = devno; int i; /* allow n.15/16 for devices attached to host port */ if (ata_is_host_link(dev->link)) alt_devno += 15; for (i = ata_force_tbl_size - 1; i >= 0; i--) { const struct ata_force_ent *fe = &ata_force_tbl[i]; unsigned int pio_mask, mwdma_mask, udma_mask; if (fe->port != -1 && fe->port != dev->link->ap->print_id) continue; if (fe->device != -1 && fe->device != devno && fe->device != alt_devno) continue; if (!fe->param.xfer_mask) continue; ata_unpack_xfermask(fe->param.xfer_mask, &pio_mask, &mwdma_mask, &udma_mask); if (udma_mask) dev->udma_mask = udma_mask; else if (mwdma_mask) { dev->udma_mask = 0; dev->mwdma_mask = mwdma_mask; } else { dev->udma_mask = 0; dev->mwdma_mask = 0; dev->pio_mask = pio_mask; } ata_dev_notice(dev, "FORCE: xfer_mask set to %s\n", fe->param.name); return; } } /** * ata_force_quirks - force quirks according to libata.force * @dev: ATA device of interest * * Force quirks according to libata.force and whine about it. * For consistency with link selection, device number 15 selects * the first device connected to the host link. * * LOCKING: * EH context. */ static void ata_force_quirks(struct ata_device *dev) { int devno = dev->link->pmp + dev->devno; int alt_devno = devno; int i; /* allow n.15/16 for devices attached to host port */ if (ata_is_host_link(dev->link)) alt_devno += 15; for (i = 0; i < ata_force_tbl_size; i++) { const struct ata_force_ent *fe = &ata_force_tbl[i]; if (fe->port != -1 && fe->port != dev->link->ap->print_id) continue; if (fe->device != -1 && fe->device != devno && fe->device != alt_devno) continue; if (!(~dev->quirks & fe->param.quirk_on) && !(dev->quirks & fe->param.quirk_off)) continue; dev->quirks |= fe->param.quirk_on; dev->quirks &= ~fe->param.quirk_off; ata_dev_notice(dev, "FORCE: modified (%s)\n", fe->param.name); } } #else static inline void ata_force_pflags(struct ata_port *ap) { } static inline void ata_force_link_limits(struct ata_link *link) { } static inline void ata_force_xfermask(struct ata_device *dev) { } static inline void ata_force_quirks(struct ata_device *dev) { } #endif /** * atapi_cmd_type - Determine ATAPI command type from SCSI opcode * @opcode: SCSI opcode * * Determine ATAPI command type from @opcode. * * LOCKING: * None. * * RETURNS: * ATAPI_{READ|WRITE|READ_CD|PASS_THRU|MISC} */ int atapi_cmd_type(u8 opcode) { switch (opcode) { case GPCMD_READ_10: case GPCMD_READ_12: return ATAPI_READ; case GPCMD_WRITE_10: case GPCMD_WRITE_12: case GPCMD_WRITE_AND_VERIFY_10: return ATAPI_WRITE; case GPCMD_READ_CD: case GPCMD_READ_CD_MSF: return ATAPI_READ_CD; case ATA_16: case ATA_12: if (atapi_passthru16) return ATAPI_PASS_THRU; fallthrough; default: return ATAPI_MISC; } } EXPORT_SYMBOL_GPL(atapi_cmd_type); static const u8 ata_rw_cmds[] = { /* pio multi */ ATA_CMD_READ_MULTI, ATA_CMD_WRITE_MULTI, ATA_CMD_READ_MULTI_EXT, ATA_CMD_WRITE_MULTI_EXT, 0, 0, 0, 0, /* pio */ ATA_CMD_PIO_READ, ATA_CMD_PIO_WRITE, ATA_CMD_PIO_READ_EXT, ATA_CMD_PIO_WRITE_EXT, 0, 0, 0, 0, /* dma */ ATA_CMD_READ, ATA_CMD_WRITE, ATA_CMD_READ_EXT, ATA_CMD_WRITE_EXT, 0, 0, 0, ATA_CMD_WRITE_FUA_EXT }; /** * ata_set_rwcmd_protocol - set taskfile r/w command and protocol * @dev: target device for the taskfile * @tf: taskfile to examine and configure * * Examine the device configuration and tf->flags to determine * the proper read/write command and protocol to use for @tf. * * LOCKING: * caller. */ static bool ata_set_rwcmd_protocol(struct ata_device *dev, struct ata_taskfile *tf) { u8 cmd; int index, fua, lba48, write; fua = (tf->flags & ATA_TFLAG_FUA) ? 4 : 0; lba48 = (tf->flags & ATA_TFLAG_LBA48) ? 2 : 0; write = (tf->flags & ATA_TFLAG_WRITE) ? 1 : 0; if (dev->flags & ATA_DFLAG_PIO) { tf->protocol = ATA_PROT_PIO; index = dev->multi_count ? 0 : 8; } else if (lba48 && (dev->link->ap->flags & ATA_FLAG_PIO_LBA48)) { /* Unable to use DMA due to host limitation */ tf->protocol = ATA_PROT_PIO; index = dev->multi_count ? 0 : 8; } else { tf->protocol = ATA_PROT_DMA; index = 16; } cmd = ata_rw_cmds[index + fua + lba48 + write]; if (!cmd) return false; tf->command = cmd; return true; } /** * ata_tf_read_block - Read block address from ATA taskfile * @tf: ATA taskfile of interest * @dev: ATA device @tf belongs to * * LOCKING: * None. * * Read block address from @tf. This function can handle all * three address formats - LBA, LBA48 and CHS. tf->protocol and * flags select the address format to use. * * RETURNS: * Block address read from @tf. */ u64 ata_tf_read_block(const struct ata_taskfile *tf, struct ata_device *dev) { u64 block = 0; if (tf->flags & ATA_TFLAG_LBA) { if (tf->flags & ATA_TFLAG_LBA48) { block |= (u64)tf->hob_lbah << 40; block |= (u64)tf->hob_lbam << 32; block |= (u64)tf->hob_lbal << 24; } else block |= (tf->device & 0xf) << 24; block |= tf->lbah << 16; block |= tf->lbam << 8; block |= tf->lbal; } else { u32 cyl, head, sect; cyl = tf->lbam | (tf->lbah << 8); head = tf->device & 0xf; sect = tf->lbal; if (!sect) { ata_dev_warn(dev, "device reported invalid CHS sector 0\n"); return U64_MAX; } block = (cyl * dev->heads + head) * dev->sectors + sect - 1; } return block; } /* * Set a taskfile command duration limit index. */ static inline void ata_set_tf_cdl(struct ata_queued_cmd *qc, int cdl) { struct ata_taskfile *tf = &qc->tf; if (tf->protocol == ATA_PROT_NCQ) tf->auxiliary |= cdl; else tf->feature |= cdl; /* * Mark this command as having a CDL and request the result * task file so that we can inspect the sense data available * bit on completion. */ qc->flags |= ATA_QCFLAG_HAS_CDL | ATA_QCFLAG_RESULT_TF; } /** * ata_build_rw_tf - Build ATA taskfile for given read/write request * @qc: Metadata associated with the taskfile to build * @block: Block address * @n_block: Number of blocks * @tf_flags: RW/FUA etc... * @cdl: Command duration limit index * @class: IO priority class * * LOCKING: * None. * * Build ATA taskfile for the command @qc for read/write request described * by @block, @n_block, @tf_flags and @class. * * RETURNS: * * 0 on success, -ERANGE if the request is too large for @dev, * -EINVAL if the request is invalid. */ int ata_build_rw_tf(struct ata_queued_cmd *qc, u64 block, u32 n_block, unsigned int tf_flags, int cdl, int class) { struct ata_taskfile *tf = &qc->tf; struct ata_device *dev = qc->dev; tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE; tf->flags |= tf_flags; if (ata_ncq_enabled(dev)) { /* yay, NCQ */ if (!lba_48_ok(block, n_block)) return -ERANGE; tf->protocol = ATA_PROT_NCQ; tf->flags |= ATA_TFLAG_LBA | ATA_TFLAG_LBA48; if (tf->flags & ATA_TFLAG_WRITE) tf->command = ATA_CMD_FPDMA_WRITE; else tf->command = ATA_CMD_FPDMA_READ; tf->nsect = qc->hw_tag << 3; tf->hob_feature = (n_block >> 8) & 0xff; tf->feature = n_block & 0xff; tf->hob_lbah = (block >> 40) & 0xff; tf->hob_lbam = (block >> 32) & 0xff; tf->hob_lbal = (block >> 24) & 0xff; tf->lbah = (block >> 16) & 0xff; tf->lbam = (block >> 8) & 0xff; tf->lbal = block & 0xff; tf->device = ATA_LBA; if (tf->flags & ATA_TFLAG_FUA) tf->device |= 1 << 7; if (dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLED && class == IOPRIO_CLASS_RT) tf->hob_nsect |= ATA_PRIO_HIGH << ATA_SHIFT_PRIO; if ((dev->flags & ATA_DFLAG_CDL_ENABLED) && cdl) ata_set_tf_cdl(qc, cdl); } else if (dev->flags & ATA_DFLAG_LBA) { tf->flags |= ATA_TFLAG_LBA; if ((dev->flags & ATA_DFLAG_CDL_ENABLED) && cdl) ata_set_tf_cdl(qc, cdl); /* Both FUA writes and a CDL index require 48-bit commands */ if (!(tf->flags & ATA_TFLAG_FUA) && !(qc->flags & ATA_QCFLAG_HAS_CDL) && lba_28_ok(block, n_block)) { /* use LBA28 */ tf->device |= (block >> 24) & 0xf; } else if (lba_48_ok(block, n_block)) { if (!(dev->flags & ATA_DFLAG_LBA48)) return -ERANGE; /* use LBA48 */ tf->flags |= ATA_TFLAG_LBA48; tf->hob_nsect = (n_block >> 8) & 0xff; tf->hob_lbah = (block >> 40) & 0xff; tf->hob_lbam = (block >> 32) & 0xff; tf->hob_lbal = (block >> 24) & 0xff; } else { /* request too large even for LBA48 */ return -ERANGE; } if (unlikely(!ata_set_rwcmd_protocol(dev, tf))) return -EINVAL; tf->nsect = n_block & 0xff; tf->lbah = (block >> 16) & 0xff; tf->lbam = (block >> 8) & 0xff; tf->lbal = block & 0xff; tf->device |= ATA_LBA; } else { /* CHS */ u32 sect, head, cyl, track; /* The request -may- be too large for CHS addressing. */ if (!lba_28_ok(block, n_block)) return -ERANGE; if (unlikely(!ata_set_rwcmd_protocol(dev, tf))) return -EINVAL; /* Convert LBA to CHS */ track = (u32)block / dev->sectors; cyl = track / dev->heads; head = track % dev->heads; sect = (u32)block % dev->sectors + 1; /* Check whether the converted CHS can fit. Cylinder: 0-65535 Head: 0-15 Sector: 1-255*/ if ((cyl >> 16) || (head >> 4) || (sect >> 8) || (!sect)) return -ERANGE; tf->nsect = n_block & 0xff; /* Sector count 0 means 256 sectors */ tf->lbal = sect; tf->lbam = cyl; tf->lbah = cyl >> 8; tf->device |= head; } return 0; } /** * ata_pack_xfermask - Pack pio, mwdma and udma masks into xfer_mask * @pio_mask: pio_mask * @mwdma_mask: mwdma_mask * @udma_mask: udma_mask * * Pack @pio_mask, @mwdma_mask and @udma_mask into a single * unsigned int xfer_mask. * * LOCKING: * None. * * RETURNS: * Packed xfer_mask. */ unsigned int ata_pack_xfermask(unsigned int pio_mask, unsigned int mwdma_mask, unsigned int udma_mask) { return ((pio_mask << ATA_SHIFT_PIO) & ATA_MASK_PIO) | ((mwdma_mask << ATA_SHIFT_MWDMA) & ATA_MASK_MWDMA) | ((udma_mask << ATA_SHIFT_UDMA) & ATA_MASK_UDMA); } EXPORT_SYMBOL_GPL(ata_pack_xfermask); /** * ata_unpack_xfermask - Unpack xfer_mask into pio, mwdma and udma masks * @xfer_mask: xfer_mask to unpack * @pio_mask: resulting pio_mask * @mwdma_mask: resulting mwdma_mask * @udma_mask: resulting udma_mask * * Unpack @xfer_mask into @pio_mask, @mwdma_mask and @udma_mask. * Any NULL destination masks will be ignored. */ void ata_unpack_xfermask(unsigned int xfer_mask, unsigned int *pio_mask, unsigned int *mwdma_mask, unsigned int *udma_mask) { if (pio_mask) *pio_mask = (xfer_mask & ATA_MASK_PIO) >> ATA_SHIFT_PIO; if (mwdma_mask) *mwdma_mask = (xfer_mask & ATA_MASK_MWDMA) >> ATA_SHIFT_MWDMA; if (udma_mask) *udma_mask = (xfer_mask & ATA_MASK_UDMA) >> ATA_SHIFT_UDMA; } static const struct ata_xfer_ent { int shift, bits; u8 base; } ata_xfer_tbl[] = { { ATA_SHIFT_PIO, ATA_NR_PIO_MODES, XFER_PIO_0 }, { ATA_SHIFT_MWDMA, ATA_NR_MWDMA_MODES, XFER_MW_DMA_0 }, { ATA_SHIFT_UDMA, ATA_NR_UDMA_MODES, XFER_UDMA_0 }, { -1, }, }; /** * ata_xfer_mask2mode - Find matching XFER_* for the given xfer_mask * @xfer_mask: xfer_mask of interest * * Return matching XFER_* value for @xfer_mask. Only the highest * bit of @xfer_mask is considered. * * LOCKING: * None. * * RETURNS: * Matching XFER_* value, 0xff if no match found. */ u8 ata_xfer_mask2mode(unsigned int xfer_mask) { int highbit = fls(xfer_mask) - 1; const struct ata_xfer_ent *ent; for (ent = ata_xfer_tbl; ent->shift >= 0; ent++) if (highbit >= ent->shift && highbit < ent->shift + ent->bits) return ent->base + highbit - ent->shift; return 0xff; } EXPORT_SYMBOL_GPL(ata_xfer_mask2mode); /** * ata_xfer_mode2mask - Find matching xfer_mask for XFER_* * @xfer_mode: XFER_* of interest * * Return matching xfer_mask for @xfer_mode. * * LOCKING: * None. * * RETURNS: * Matching xfer_mask, 0 if no match found. */ unsigned int ata_xfer_mode2mask(u8 xfer_mode) { const struct ata_xfer_ent *ent; for (ent = ata_xfer_tbl; ent->shift >= 0; ent++) if (xfer_mode >= ent->base && xfer_mode < ent->base + ent->bits) return ((2 << (ent->shift + xfer_mode - ent->base)) - 1) & ~((1 << ent->shift) - 1); return 0; } EXPORT_SYMBOL_GPL(ata_xfer_mode2mask); /** * ata_xfer_mode2shift - Find matching xfer_shift for XFER_* * @xfer_mode: XFER_* of interest * * Return matching xfer_shift for @xfer_mode. * * LOCKING: * None. * * RETURNS: * Matching xfer_shift, -1 if no match found. */ int ata_xfer_mode2shift(u8 xfer_mode) { const struct ata_xfer_ent *ent; for (ent = ata_xfer_tbl; ent->shift >= 0; ent++) if (xfer_mode >= ent->base && xfer_mode < ent->base + ent->bits) return ent->shift; return -1; } EXPORT_SYMBOL_GPL(ata_xfer_mode2shift); /** * ata_mode_string - convert xfer_mask to string * @xfer_mask: mask of bits supported; only highest bit counts. * * Determine string which represents the highest speed * (highest bit in @modemask). * * LOCKING: * None. * * RETURNS: * Constant C string representing highest speed listed in * @mode_mask, or the constant C string "<n/a>". */ const char *ata_mode_string(unsigned int xfer_mask) { static const char * const xfer_mode_str[] = { "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5", "PIO6", "MWDMA0", "MWDMA1", "MWDMA2", "MWDMA3", "MWDMA4", "UDMA/16", "UDMA/25", "UDMA/33", "UDMA/44", "UDMA/66", "UDMA/100", "UDMA/133", "UDMA7", }; int highbit; highbit = fls(xfer_mask) - 1; if (highbit >= 0 && highbit < ARRAY_SIZE(xfer_mode_str)) return xfer_mode_str[highbit]; return "<n/a>"; } EXPORT_SYMBOL_GPL(ata_mode_string); const char *sata_spd_string(unsigned int spd) { static const char * const spd_str[] = { "1.5 Gbps", "3.0 Gbps", "6.0 Gbps", }; if (spd == 0 || (spd - 1) >= ARRAY_SIZE(spd_str)) return "<unknown>"; return spd_str[spd - 1]; } /** * ata_dev_classify - determine device type based on ATA-spec signature * @tf: ATA taskfile register set for device to be identified * * Determine from taskfile register contents whether a device is * ATA or ATAPI, as per "Signature and persistence" section * of ATA/PI spec (volume 1, sect 5.14). * * LOCKING: * None. * * RETURNS: * Device type, %ATA_DEV_ATA, %ATA_DEV_ATAPI, %ATA_DEV_PMP, * %ATA_DEV_ZAC, or %ATA_DEV_UNKNOWN the event of failure. */ unsigned int ata_dev_classify(const struct ata_taskfile *tf) { /* Apple's open source Darwin code hints that some devices only * put a proper signature into the LBA mid/high registers, * So, we only check those. It's sufficient for uniqueness. * * ATA/ATAPI-7 (d1532v1r1: Feb. 19, 2003) specified separate * signatures for ATA and ATAPI devices attached on SerialATA, * 0x3c/0xc3 and 0x69/0x96 respectively. However, SerialATA * spec has never mentioned about using different signatures * for ATA/ATAPI devices. Then, Serial ATA II: Port * Multiplier specification began to use 0x69/0x96 to identify * port multpliers and 0x3c/0xc3 to identify SEMB device. * ATA/ATAPI-7 dropped descriptions about 0x3c/0xc3 and * 0x69/0x96 shortly and described them as reserved for * SerialATA. * * We follow the current spec and consider that 0x69/0x96 * identifies a port multiplier and 0x3c/0xc3 a SEMB device. * Unfortunately, WDC WD1600JS-62MHB5 (a hard drive) reports * SEMB signature. This is worked around in * ata_dev_read_id(). */ if (tf->lbam == 0 && tf->lbah == 0) return ATA_DEV_ATA; if (tf->lbam == 0x14 && tf->lbah == 0xeb) return ATA_DEV_ATAPI; if (tf->lbam == 0x69 && tf->lbah == 0x96) return ATA_DEV_PMP; if (tf->lbam == 0x3c && tf->lbah == 0xc3) return ATA_DEV_SEMB; if (tf->lbam == 0xcd && tf->lbah == 0xab) return ATA_DEV_ZAC; return ATA_DEV_UNKNOWN; } EXPORT_SYMBOL_GPL(ata_dev_classify); /** * ata_id_string - Convert IDENTIFY DEVICE page into string * @id: IDENTIFY DEVICE results we will examine * @s: string into which data is output * @ofs: offset into identify device page * @len: length of string to return. must be an even number. * * The strings in the IDENTIFY DEVICE page are broken up into * 16-bit chunks. Run through the string, and output each * 8-bit chunk linearly, regardless of platform. * * LOCKING: * caller. */ void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs, unsigned int len) { unsigned int c; BUG_ON(len & 1); while (len > 0) { c = id[ofs] >> 8; *s = c; s++; c = id[ofs] & 0xff; *s = c; s++; ofs++; len -= 2; } } EXPORT_SYMBOL_GPL(ata_id_string); /** * ata_id_c_string - Convert IDENTIFY DEVICE page into C string * @id: IDENTIFY DEVICE results we will examine * @s: string into which data is output * @ofs: offset into identify device page * @len: length of string to return. must be an odd number. * * This function is identical to ata_id_string except that it * trims trailing spaces and terminates the resulting string with * null. @len must be actual maximum length (even number) + 1. * * LOCKING: * caller. */ void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs, unsigned int len) { unsigned char *p; ata_id_string(id, s, ofs, len - 1); p = s + strnlen(s, len - 1); while (p > s && p[-1] == ' ') p--; *p = '\0'; } EXPORT_SYMBOL_GPL(ata_id_c_string); static u64 ata_id_n_sectors(const u16 *id) { if (ata_id_has_lba(id)) { if (ata_id_has_lba48(id)) return ata_id_u64(id, ATA_ID_LBA_CAPACITY_2); return ata_id_u32(id, ATA_ID_LBA_CAPACITY); } if (ata_id_current_chs_valid(id)) return (u32)id[ATA_ID_CUR_CYLS] * (u32)id[ATA_ID_CUR_HEADS] * (u32)id[ATA_ID_CUR_SECTORS]; return (u32)id[ATA_ID_CYLS] * (u32)id[ATA_ID_HEADS] * (u32)id[ATA_ID_SECTORS]; } u64 ata_tf_to_lba48(const struct ata_taskfile *tf) { u64 sectors = 0; sectors |= ((u64)(tf->hob_lbah & 0xff)) << 40; sectors |= ((u64)(tf->hob_lbam & 0xff)) << 32; sectors |= ((u64)(tf->hob_lbal & 0xff)) << 24; sectors |= (tf->lbah & 0xff) << 16; sectors |= (tf->lbam & 0xff) << 8; sectors |= (tf->lbal & 0xff); return sectors; } u64 ata_tf_to_lba(const struct ata_taskfile *tf) { u64 sectors = 0; sectors |= (tf->device & 0x0f) << 24; sectors |= (tf->lbah & 0xff) << 16; sectors |= (tf->lbam & 0xff) << 8; sectors |= (tf->lbal & 0xff); return sectors; } /** * ata_read_native_max_address - Read native max address * @dev: target device * @max_sectors: out parameter for the result native max address * * Perform an LBA48 or LBA28 native size query upon the device in * question. * * RETURNS: * 0 on success, -EACCES if command is aborted by the drive. * -EIO on other errors. */ static int ata_read_native_max_address(struct ata_device *dev, u64 *max_sectors) { unsigned int err_mask; struct ata_taskfile tf; int lba48 = ata_id_has_lba48(dev->id); ata_tf_init(dev, &tf); /* always clear all address registers */ tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR; if (lba48) { tf.command = ATA_CMD_READ_NATIVE_MAX_EXT; tf.flags |= ATA_TFLAG_LBA48; } else tf.command = ATA_CMD_READ_NATIVE_MAX; tf.protocol = ATA_PROT_NODATA; tf.device |= ATA_LBA; err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); if (err_mask) { ata_dev_warn(dev, "failed to read native max address (err_mask=0x%x)\n", err_mask); if (err_mask == AC_ERR_DEV && (tf.error & ATA_ABORTED)) return -EACCES; return -EIO; } if (lba48) *max_sectors = ata_tf_to_lba48(&tf) + 1; else *max_sectors = ata_tf_to_lba(&tf) + 1; if (dev->quirks & ATA_QUIRK_HPA_SIZE) (*max_sectors)--; return 0; } /** * ata_set_max_sectors - Set max sectors * @dev: target device * @new_sectors: new max sectors value to set for the device * * Set max sectors of @dev to @new_sectors. * * RETURNS: * 0 on success, -EACCES if command is aborted or denied (due to * previous non-volatile SET_MAX) by the drive. -EIO on other * errors. */ static int ata_set_max_sectors(struct ata_device *dev, u64 new_sectors) { unsigned int err_mask; struct ata_taskfile tf; int lba48 = ata_id_has_lba48(dev->id); new_sectors--; ata_tf_init(dev, &tf); tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR; if (lba48) { tf.command = ATA_CMD_SET_MAX_EXT; tf.flags |= ATA_TFLAG_LBA48; tf.hob_lbal = (new_sectors >> 24) & 0xff; tf.hob_lbam = (new_sectors >> 32) & 0xff; tf.hob_lbah = (new_sectors >> 40) & 0xff; } else { tf.command = ATA_CMD_SET_MAX; tf.device |= (new_sectors >> 24) & 0xf; } tf.protocol = ATA_PROT_NODATA; tf.device |= ATA_LBA; tf.lbal = (new_sectors >> 0) & 0xff; tf.lbam = (new_sectors >> 8) & 0xff; tf.lbah = (new_sectors >> 16) & 0xff; err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); if (err_mask) { ata_dev_warn(dev, "failed to set max address (err_mask=0x%x)\n", err_mask); if (err_mask == AC_ERR_DEV && (tf.error & (ATA_ABORTED | ATA_IDNF))) return -EACCES; return -EIO; } return 0; } /** * ata_hpa_resize - Resize a device with an HPA set * @dev: Device to resize * * Read the size of an LBA28 or LBA48 disk with HPA features and resize * it if required to the full size of the media. The caller must check * the drive has the HPA feature set enabled. * * RETURNS: * 0 on success, -errno on failure. */ static int ata_hpa_resize(struct ata_device *dev) { bool print_info = ata_dev_print_info(dev); bool unlock_hpa = ata_ignore_hpa || dev->flags & ATA_DFLAG_UNLOCK_HPA; u64 sectors = ata_id_n_sectors(dev->id); u64 native_sectors; int rc; /* do we need to do it? */ if ((dev->class != ATA_DEV_ATA && dev->class != ATA_DEV_ZAC) || !ata_id_has_lba(dev->id) || !ata_id_hpa_enabled(dev->id) || (dev->quirks & ATA_QUIRK_BROKEN_HPA)) return 0; /* read native max address */ rc = ata_read_native_max_address(dev, &native_sectors); if (rc) { /* If device aborted the command or HPA isn't going to * be unlocked, skip HPA resizing. */ if (rc == -EACCES || !unlock_hpa) { ata_dev_warn(dev, "HPA support seems broken, skipping HPA handling\n"); dev->quirks |= ATA_QUIRK_BROKEN_HPA; /* we can continue if device aborted the command */ if (rc == -EACCES) rc = 0; } return rc; } dev->n_native_sectors = native_sectors; /* nothing to do? */ if (native_sectors <= sectors || !unlock_hpa) { if (!print_info || native_sectors == sectors) return 0; if (native_sectors > sectors) ata_dev_info(dev, "HPA detected: current %llu, native %llu\n", (unsigned long long)sectors, (unsigned long long)native_sectors); else if (native_sectors < sectors) ata_dev_warn(dev, "native sectors (%llu) is smaller than sectors (%llu)\n", (unsigned long long)native_sectors, (unsigned long long)sectors); return 0; } /* let's unlock HPA */ rc = ata_set_max_sectors(dev, native_sectors); if (rc == -EACCES) { /* if device aborted the command, skip HPA resizing */ ata_dev_warn(dev, "device aborted resize (%llu -> %llu), skipping HPA handling\n", (unsigned long long)sectors, (unsigned long long)native_sectors); dev->quirks |= ATA_QUIRK_BROKEN_HPA; return 0; } else if (rc) return rc; /* re-read IDENTIFY data */ rc = ata_dev_reread_id(dev, 0); if (rc) { ata_dev_err(dev, "failed to re-read IDENTIFY data after HPA resizing\n"); return rc; } if (print_info) { u64 new_sectors = ata_id_n_sectors(dev->id); ata_dev_info(dev, "HPA unlocked: %llu -> %llu, native %llu\n", (unsigned long long)sectors, (unsigned long long)new_sectors, (unsigned long long)native_sectors); } return 0; } /** * ata_dump_id - IDENTIFY DEVICE info debugging output * @dev: device from which the information is fetched * @id: IDENTIFY DEVICE page to dump * * Dump selected 16-bit words from the given IDENTIFY DEVICE * page. * * LOCKING: * caller. */ static inline void ata_dump_id(struct ata_device *dev, const u16 *id) { ata_dev_dbg(dev, "49==0x%04x 53==0x%04x 63==0x%04x 64==0x%04x 75==0x%04x\n" "80==0x%04x 81==0x%04x 82==0x%04x 83==0x%04x 84==0x%04x\n" "88==0x%04x 93==0x%04x\n", id[49], id[53], id[63], id[64], id[75], id[80], id[81], id[82], id[83], id[84], id[88], id[93]); } /** * ata_id_xfermask - Compute xfermask from the given IDENTIFY data * @id: IDENTIFY data to compute xfer mask from * * Compute the xfermask for this device. This is not as trivial * as it seems if we must consider early devices correctly. * * FIXME: pre IDE drive timing (do we care ?). * * LOCKING: * None. * * RETURNS: * Computed xfermask */ unsigned int ata_id_xfermask(const u16 *id) { unsigned int pio_mask, mwdma_mask, udma_mask; /* Usual case. Word 53 indicates word 64 is valid */ if (id[ATA_ID_FIELD_VALID] & (1 << 1)) { pio_mask = id[ATA_ID_PIO_MODES] & 0x03; pio_mask <<= 3; pio_mask |= 0x7; } else { /* If word 64 isn't valid then Word 51 high byte holds * the PIO timing number for the maximum. Turn it into * a mask. */ u8 mode = (id[ATA_ID_OLD_PIO_MODES] >> 8) & 0xFF; if (mode < 5) /* Valid PIO range */ pio_mask = (2 << mode) - 1; else pio_mask = 1; /* But wait.. there's more. Design your standards by * committee and you too can get a free iordy field to * process. However it is the speeds not the modes that * are supported... Note drivers using the timing API * will get this right anyway */ } mwdma_mask = id[ATA_ID_MWDMA_MODES] & 0x07; if (ata_id_is_cfa(id)) { /* * Process compact flash extended modes */ int pio = (id[ATA_ID_CFA_MODES] >> 0) & 0x7; int dma = (id[ATA_ID_CFA_MODES] >> 3) & 0x7; if (pio) pio_mask |= (1 << 5); if (pio > 1) pio_mask |= (1 << 6); if (dma) mwdma_mask |= (1 << 3); if (dma > 1) mwdma_mask |= (1 << 4); } udma_mask = 0; if (id[ATA_ID_FIELD_VALID] & (1 << 2)) udma_mask = id[ATA_ID_UDMA_MODES] & 0xff; return ata_pack_xfermask(pio_mask, mwdma_mask, udma_mask); } EXPORT_SYMBOL_GPL(ata_id_xfermask); static void ata_qc_complete_internal(struct ata_queued_cmd *qc) { struct completion *waiting = qc->private_data; complete(waiting); } /** * ata_exec_internal - execute libata internal command * @dev: Device to which the command is sent * @tf: Taskfile registers for the command and the result * @cdb: CDB for packet command * @dma_dir: Data transfer direction of the command * @buf: Data buffer of the command * @buflen: Length of data buffer * @timeout: Timeout in msecs (0 for default) * * Executes libata internal command with timeout. @tf contains * the command on entry and the result on return. Timeout and error * conditions are reported via the return value. No recovery action * is taken after a command times out. It is the caller's duty to * clean up after timeout. * * LOCKING: * None. Should be called with kernel context, might sleep. * * RETURNS: * Zero on success, AC_ERR_* mask on failure */ unsigned int ata_exec_internal(struct ata_device *dev, struct ata_taskfile *tf, const u8 *cdb, enum dma_data_direction dma_dir, void *buf, unsigned int buflen, unsigned int timeout) { struct ata_link *link = dev->link; struct ata_port *ap = link->ap; u8 command = tf->command; struct ata_queued_cmd *qc; struct scatterlist sgl; unsigned int preempted_tag; u32 preempted_sactive; u64 preempted_qc_active; int preempted_nr_active_links; bool auto_timeout = false; DECLARE_COMPLETION_ONSTACK(wait); unsigned long flags; unsigned int err_mask; int rc; if (WARN_ON(dma_dir != DMA_NONE && !buf)) return AC_ERR_INVALID; spin_lock_irqsave(ap->lock, flags); /* No internal command while frozen */ if (ata_port_is_frozen(ap)) { spin_unlock_irqrestore(ap->lock, flags); return AC_ERR_SYSTEM; } /* Initialize internal qc */ qc = __ata_qc_from_tag(ap, ATA_TAG_INTERNAL); qc->tag = ATA_TAG_INTERNAL; qc->hw_tag = 0; qc->scsicmd = NULL; qc->ap = ap; qc->dev = dev; ata_qc_reinit(qc); preempted_tag = link->active_tag; preempted_sactive = link->sactive; preempted_qc_active = ap->qc_active; preempted_nr_active_links = ap->nr_active_links; link->active_tag = ATA_TAG_POISON; link->sactive = 0; ap->qc_active = 0; ap->nr_active_links = 0; /* Prepare and issue qc */ qc->tf = *tf; if (cdb) memcpy(qc->cdb, cdb, ATAPI_CDB_LEN); /* Some SATA bridges need us to indicate data xfer direction */ if (tf->protocol == ATAPI_PROT_DMA && (dev->flags & ATA_DFLAG_DMADIR) && dma_dir == DMA_FROM_DEVICE) qc->tf.feature |= ATAPI_DMADIR; qc->flags |= ATA_QCFLAG_RESULT_TF; qc->dma_dir = dma_dir; if (dma_dir != DMA_NONE) { sg_init_one(&sgl, buf, buflen); ata_sg_init(qc, &sgl, 1); qc->nbytes = buflen; } qc->private_data = &wait; qc->complete_fn = ata_qc_complete_internal; ata_qc_issue(qc); spin_unlock_irqrestore(ap->lock, flags); if (!timeout) { if (ata_probe_timeout) { timeout = ata_probe_timeout * 1000; } else { timeout = ata_internal_cmd_timeout(dev, command); auto_timeout = true; } } ata_eh_release(ap); rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout)); ata_eh_acquire(ap); ata_sff_flush_pio_task(ap); if (!rc) { /* * We are racing with irq here. If we lose, the following test * prevents us from completing the qc twice. If we win, the port * is frozen and will be cleaned up by ->post_internal_cmd(). */ spin_lock_irqsave(ap->lock, flags); if (qc->flags & ATA_QCFLAG_ACTIVE) { qc->err_mask |= AC_ERR_TIMEOUT; ata_port_freeze(ap); ata_dev_warn(dev, "qc timeout after %u msecs (cmd 0x%x)\n", timeout, command); } spin_unlock_irqrestore(ap->lock, flags); } if (ap->ops->post_internal_cmd) ap->ops->post_internal_cmd(qc); /* Perform minimal error analysis */ if (qc->flags & ATA_QCFLAG_EH) { if (qc->result_tf.status & (ATA_ERR | ATA_DF)) qc->err_mask |= AC_ERR_DEV; if (!qc->err_mask) qc->err_mask |= AC_ERR_OTHER; if (qc->err_mask & ~AC_ERR_OTHER) qc->err_mask &= ~AC_ERR_OTHER; } else if (qc->tf.command == ATA_CMD_REQ_SENSE_DATA) { qc->result_tf.status |= ATA_SENSE; } /* Finish up */ spin_lock_irqsave(ap->lock, flags); *tf = qc->result_tf; err_mask = qc->err_mask; ata_qc_free(qc); link->active_tag = preempted_tag; link->sactive = preempted_sactive; ap->qc_active = preempted_qc_active; ap->nr_active_links = preempted_nr_active_links; spin_unlock_irqrestore(ap->lock, flags); if ((err_mask & AC_ERR_TIMEOUT) && auto_timeout) ata_internal_cmd_timed_out(dev, command); return err_mask; } /** * ata_pio_need_iordy - check if iordy needed * @adev: ATA device * * Check if the current speed of the device requires IORDY. Used * by various controllers for chip configuration. */ unsigned int ata_pio_need_iordy(const struct ata_device *adev) { /* Don't set IORDY if we're preparing for reset. IORDY may * lead to controller lock up on certain controllers if the * port is not occupied. See bko#11703 for details. */ if (adev->link->ap->pflags & ATA_PFLAG_RESETTING) return 0; /* Controller doesn't support IORDY. Probably a pointless * check as the caller should know this. */ if (adev->link->ap->flags & ATA_FLAG_NO_IORDY) return 0; /* CF spec. r4.1 Table 22 says no iordy on PIO5 and PIO6. */ if (ata_id_is_cfa(adev->id) && (adev->pio_mode == XFER_PIO_5 || adev->pio_mode == XFER_PIO_6)) return 0; /* PIO3 and higher it is mandatory */ if (adev->pio_mode > XFER_PIO_2) return 1; /* We turn it on when possible */ if (ata_id_has_iordy(adev->id)) return 1; return 0; } EXPORT_SYMBOL_GPL(ata_pio_need_iordy); /** * ata_pio_mask_no_iordy - Return the non IORDY mask * @adev: ATA device * * Compute the highest mode possible if we are not using iordy. Return * -1 if no iordy mode is available. */ static u32 ata_pio_mask_no_iordy(const struct ata_device *adev) { /* If we have no drive specific rule, then PIO 2 is non IORDY */ if (adev->id[ATA_ID_FIELD_VALID] & 2) { /* EIDE */ u16 pio = adev->id[ATA_ID_EIDE_PIO]; /* Is the speed faster than the drive allows non IORDY ? */ if (pio) { /* This is cycle times not frequency - watch the logic! */ if (pio > 240) /* PIO2 is 240nS per cycle */ return 3 << ATA_SHIFT_PIO; return 7 << ATA_SHIFT_PIO; } } return 3 << ATA_SHIFT_PIO; } /** * ata_do_dev_read_id - default ID read method * @dev: device * @tf: proposed taskfile * @id: data buffer * * Issue the identify taskfile and hand back the buffer containing * identify data. For some RAID controllers and for pre ATA devices * this function is wrapped or replaced by the driver */ unsigned int ata_do_dev_read_id(struct ata_device *dev, struct ata_taskfile *tf, __le16 *id) { return ata_exec_internal(dev, tf, NULL, DMA_FROM_DEVICE, id, sizeof(id[0]) * ATA_ID_WORDS, 0); } EXPORT_SYMBOL_GPL(ata_do_dev_read_id); /** * ata_dev_read_id - Read ID data from the specified device * @dev: target device * @p_class: pointer to class of the target device (may be changed) * @flags: ATA_READID_* flags * @id: buffer to read IDENTIFY data into * * Read ID data from the specified device. ATA_CMD_ID_ATA is * performed on ATA devices and ATA_CMD_ID_ATAPI on ATAPI * devices. This function also issues ATA_CMD_INIT_DEV_PARAMS * for pre-ATA4 drives. * * FIXME: ATA_CMD_ID_ATA is optional for early drives and right * now we abort if we hit that case. * * LOCKING: * Kernel thread context (may sleep) * * RETURNS: * 0 on success, -errno otherwise. */ int ata_dev_read_id(struct ata_device *dev, unsigned int *p_class, unsigned int flags, u16 *id) { struct ata_port *ap = dev->link->ap; unsigned int class = *p_class; struct ata_taskfile tf; unsigned int err_mask = 0; const char *reason; bool is_semb = class == ATA_DEV_SEMB; int may_fallback = 1, tried_spinup = 0; int rc; retry: ata_tf_init(dev, &tf); switch (class) { case ATA_DEV_SEMB: class = ATA_DEV_ATA; /* some hard drives report SEMB sig */ fallthrough; case ATA_DEV_ATA: case ATA_DEV_ZAC: tf.command = ATA_CMD_ID_ATA; break; case ATA_DEV_ATAPI: tf.command = ATA_CMD_ID_ATAPI; break; default: rc = -ENODEV; reason = "unsupported class"; goto err_out; } tf.protocol = ATA_PROT_PIO; /* Some devices choke if TF registers contain garbage. Make * sure those are properly initialized. */ tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE; /* Device presence detection is unreliable on some * controllers. Always poll IDENTIFY if available. */ tf.flags |= ATA_TFLAG_POLLING; if (ap->ops->read_id) err_mask = ap->ops->read_id(dev, &tf, (__le16 *)id); else err_mask = ata_do_dev_read_id(dev, &tf, (__le16 *)id); if (err_mask) { if (err_mask & AC_ERR_NODEV_HINT) { ata_dev_dbg(dev, "NODEV after polling detection\n"); return -ENOENT; } if (is_semb) { ata_dev_info(dev, "IDENTIFY failed on device w/ SEMB sig, disabled\n"); /* SEMB is not supported yet */ *p_class = ATA_DEV_SEMB_UNSUP; return 0; } if ((err_mask == AC_ERR_DEV) && (tf.error & ATA_ABORTED)) { /* Device or controller might have reported * the wrong device class. Give a shot at the * other IDENTIFY if the current one is * aborted by the device. */ if (may_fallback) { may_fallback = 0; if (class == ATA_DEV_ATA) class = ATA_DEV_ATAPI; else class = ATA_DEV_ATA; goto retry; } /* Control reaches here iff the device aborted * both flavors of IDENTIFYs which happens * sometimes with phantom devices. */ ata_dev_dbg(dev, "both IDENTIFYs aborted, assuming NODEV\n"); return -ENOENT; } rc = -EIO; reason = "I/O error"; goto err_out; } if (dev->quirks & ATA_QUIRK_DUMP_ID) { ata_dev_info(dev, "dumping IDENTIFY data, " "class=%d may_fallback=%d tried_spinup=%d\n", class, may_fallback, tried_spinup); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_OFFSET, 16, 2, id, ATA_ID_WORDS * sizeof(*id), true); } /* Falling back doesn't make sense if ID data was read * successfully at least once. */ may_fallback = 0; swap_buf_le16(id, ATA_ID_WORDS); /* sanity check */ rc = -EINVAL; reason = "device reports invalid type"; if (class == ATA_DEV_ATA || class == ATA_DEV_ZAC) { if (!ata_id_is_ata(id) && !ata_id_is_cfa(id)) goto err_out; if (ap->host->flags & ATA_HOST_IGNORE_ATA && ata_id_is_ata(id)) { ata_dev_dbg(dev, "host indicates ignore ATA devices, ignored\n"); return -ENOENT; } } else { if (ata_id_is_ata(id)) goto err_out; } if (!tried_spinup && (id[2] == 0x37c8 || id[2] == 0x738c)) { tried_spinup = 1; /* * Drive powered-up in standby mode, and requires a specific * SET_FEATURES spin-up subcommand before it will accept * anything other than the original IDENTIFY command. */ err_mask = ata_dev_set_feature(dev, SETFEATURES_SPINUP, 0); if (err_mask && id[2] != 0x738c) { rc = -EIO; reason = "SPINUP failed"; goto err_out; } /* * If the drive initially returned incomplete IDENTIFY info, * we now must reissue the IDENTIFY command. */ if (id[2] == 0x37c8) goto retry; } if ((flags & ATA_READID_POSTRESET) && (class == ATA_DEV_ATA || class == ATA_DEV_ZAC)) { /* * The exact sequence expected by certain pre-ATA4 drives is: * SRST RESET * IDENTIFY (optional in early ATA) * INITIALIZE DEVICE PARAMETERS (later IDE and ATA) * anything else.. * Some drives were very specific about that exact sequence. * * Note that ATA4 says lba is mandatory so the second check * should never trigger. */ if (ata_id_major_version(id) < 4 || !ata_id_has_lba(id)) { err_mask = ata_dev_init_params(dev, id[3], id[6]); if (err_mask) { rc = -EIO; reason = "INIT_DEV_PARAMS failed"; goto err_out; } /* current CHS translation info (id[53-58]) might be * changed. reread the identify device info. */ flags &= ~ATA_READID_POSTRESET; goto retry; } } *p_class = class; return 0; err_out: ata_dev_warn(dev, "failed to IDENTIFY (%s, err_mask=0x%x)\n", reason, err_mask); return rc; } bool ata_dev_power_init_tf(struct ata_device *dev, struct ata_taskfile *tf, bool set_active) { /* Only applies to ATA and ZAC devices */ if (dev->class != ATA_DEV_ATA && dev->class != ATA_DEV_ZAC) return false; ata_tf_init(dev, tf); tf->flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR; tf->protocol = ATA_PROT_NODATA; if (set_active) { /* VERIFY for 1 sector at lba=0 */ tf->command = ATA_CMD_VERIFY; tf->nsect = 1; if (dev->flags & ATA_DFLAG_LBA) { tf->flags |= ATA_TFLAG_LBA; tf->device |= ATA_LBA; } else { /* CHS */ tf->lbal = 0x1; /* sect */ } } else { tf->command = ATA_CMD_STANDBYNOW1; } return true; } static bool ata_dev_power_is_active(struct ata_device *dev) { struct ata_taskfile tf; unsigned int err_mask; ata_tf_init(dev, &tf); tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR; tf.protocol = ATA_PROT_NODATA; tf.command = ATA_CMD_CHK_POWER; err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); if (err_mask) { ata_dev_err(dev, "Check power mode failed (err_mask=0x%x)\n", err_mask); /* * Assume we are in standby mode so that we always force a * spinup in ata_dev_power_set_active(). */ return false; } ata_dev_dbg(dev, "Power mode: 0x%02x\n", tf.nsect); /* Active or idle */ return tf.nsect == 0xff; } /** * ata_dev_power_set_standby - Set a device power mode to standby * @dev: target device * * Issue a STANDBY IMMEDIATE command to set a device power mode to standby. * For an HDD device, this spins down the disks. * * LOCKING: * Kernel thread context (may sleep). */ void ata_dev_power_set_standby(struct ata_device *dev) { unsigned long ap_flags = dev->link->ap->flags; struct ata_taskfile tf; unsigned int err_mask; /* If the device is already sleeping or in standby, do nothing. */ if ((dev->flags & ATA_DFLAG_SLEEPING) || !ata_dev_power_is_active(dev)) return; /* * Some odd clown BIOSes issue spindown on power off (ACPI S4 or S5) * causing some drives to spin up and down again. For these, do nothing * if we are being called on shutdown. */ if ((ap_flags & ATA_FLAG_NO_POWEROFF_SPINDOWN) && system_state == SYSTEM_POWER_OFF) return; if ((ap_flags & ATA_FLAG_NO_HIBERNATE_SPINDOWN) && system_entering_hibernation()) return; /* Issue STANDBY IMMEDIATE command only if supported by the device */ if (!ata_dev_power_init_tf(dev, &tf, false)) return; ata_dev_notice(dev, "Entering standby power mode\n"); err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); if (err_mask) ata_dev_err(dev, "STANDBY IMMEDIATE failed (err_mask=0x%x)\n", err_mask); } /** * ata_dev_power_set_active - Set a device power mode to active * @dev: target device * * Issue a VERIFY command to enter to ensure that the device is in the * active power mode. For a spun-down HDD (standby or idle power mode), * the VERIFY command will complete after the disk spins up. * * LOCKING: * Kernel thread context (may sleep). */ void ata_dev_power_set_active(struct ata_device *dev) { struct ata_taskfile tf; unsigned int err_mask; /* * Issue READ VERIFY SECTORS command for 1 sector at lba=0 only * if supported by the device. */ if (!ata_dev_power_init_tf(dev, &tf, true)) return; /* * Check the device power state & condition and force a spinup with * VERIFY command only if the drive is not already ACTIVE or IDLE. */ if (ata_dev_power_is_active(dev)) return; ata_dev_notice(dev, "Entering active power mode\n"); err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); if (err_mask) ata_dev_err(dev, "VERIFY failed (err_mask=0x%x)\n", err_mask); } /** * ata_read_log_page - read a specific log page * @dev: target device * @log: log to read * @page: page to read * @buf: buffer to store read page * @sectors: number of sectors to read * * Read log page using READ_LOG_EXT command. * * LOCKING: * Kernel thread context (may sleep). * * RETURNS: * 0 on success, AC_ERR_* mask otherwise. */ unsigned int ata_read_log_page(struct ata_device *dev, u8 log, u8 page, void *buf, unsigned int sectors) { unsigned long ap_flags = dev->link->ap->flags; struct ata_taskfile tf; unsigned int err_mask; bool dma = false; ata_dev_dbg(dev, "read log page - log 0x%x, page 0x%x\n", log, page); /* * Return error without actually issuing the command on controllers * which e.g. lockup on a read log page. */ if (ap_flags & ATA_FLAG_NO_LOG_PAGE) return AC_ERR_DEV; retry: ata_tf_init(dev, &tf); if (ata_dma_enabled(dev) && ata_id_has_read_log_dma_ext(dev->id) && !(dev->quirks & ATA_QUIRK_NO_DMA_LOG)) { tf.command = ATA_CMD_READ_LOG_DMA_EXT; tf.protocol = ATA_PROT_DMA; dma = true; } else { tf.command = ATA_CMD_READ_LOG_EXT; tf.protocol = ATA_PROT_PIO; dma = false; } tf.lbal = log; tf.lbam = page; tf.nsect = sectors; tf.hob_nsect = sectors >> 8; tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_LBA48 | ATA_TFLAG_DEVICE; err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE, buf, sectors * ATA_SECT_SIZE, 0); if (err_mask) { if (dma) { dev->quirks |= ATA_QUIRK_NO_DMA_LOG; if (!ata_port_is_frozen(dev->link->ap)) goto retry; } ata_dev_err(dev, "Read log 0x%02x page 0x%02x failed, Emask 0x%x\n", (unsigned int)log, (unsigned int)page, err_mask); } return err_mask; } static inline void ata_clear_log_directory(struct ata_device *dev) { memset(dev->gp_log_dir, 0, ATA_SECT_SIZE); } static int ata_read_log_directory(struct ata_device *dev) { u16 version; /* If the log page is already cached, do nothing. */ version = get_unaligned_le16(&dev->gp_log_dir[0]); if (version == 0x0001) return 0; if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, dev->gp_log_dir, 1)) { ata_clear_log_directory(dev); return -EIO; } version = get_unaligned_le16(&dev->gp_log_dir[0]); if (version != 0x0001) { ata_dev_err(dev, "Invalid log directory version 0x%04x\n", version); ata_clear_log_directory(dev); dev->quirks |= ATA_QUIRK_NO_LOG_DIR; return -EINVAL; } return 0; } static int ata_log_supported(struct ata_device *dev, u8 log) { if (dev->quirks & ATA_QUIRK_NO_LOG_DIR) return 0; if (ata_read_log_directory(dev)) return 0; return get_unaligned_le16(&dev->gp_log_dir[log * 2]); } static bool ata_identify_page_supported(struct ata_device *dev, u8 page) { unsigned int err, i; if (dev->quirks & ATA_QUIRK_NO_ID_DEV_LOG) return false; if (!ata_log_supported(dev, ATA_LOG_IDENTIFY_DEVICE)) { /* * IDENTIFY DEVICE data log is defined as mandatory starting * with ACS-3 (ATA version 10). Warn about the missing log * for drives which implement this ATA level or above. */ if (ata_id_major_version(dev->id) >= 10) ata_dev_warn(dev, "ATA Identify Device Log not supported\n"); dev->quirks |= ATA_QUIRK_NO_ID_DEV_LOG; return false; } /* * Read IDENTIFY DEVICE data log, page 0, to figure out if the page is * supported. */ err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, 0, dev->sector_buf, 1); if (err) return false; for (i = 0; i < dev->sector_buf[8]; i++) { if (dev->sector_buf[9 + i] == page) return true; } return false; } static int ata_do_link_spd_quirk(struct ata_device *dev) { struct ata_link *plink = ata_dev_phys_link(dev); u32 target, target_limit; if (!sata_scr_valid(plink)) return 0; if (dev->quirks & ATA_QUIRK_1_5_GBPS) target = 1; else return 0; target_limit = (1 << target) - 1; /* if already on stricter limit, no need to push further */ if (plink->sata_spd_limit <= target_limit) return 0; plink->sata_spd_limit = target_limit; /* Request another EH round by returning -EAGAIN if link is * going faster than the target speed. Forward progress is * guaranteed by setting sata_spd_limit to target_limit above. */ if (plink->sata_spd > target) { ata_dev_info(dev, "applying link speed limit quirk to %s\n", sata_spd_string(target)); return -EAGAIN; } return 0; } static inline bool ata_dev_knobble(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; if (ata_dev_quirks(dev) & ATA_QUIRK_BRIDGE_OK) return false; return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id))); } static void ata_dev_config_ncq_send_recv(struct ata_device *dev) { unsigned int err_mask; if (!ata_log_supported(dev, ATA_LOG_NCQ_SEND_RECV)) { ata_dev_warn(dev, "NCQ Send/Recv Log not supported\n"); return; } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV, 0, dev->sector_buf, 1); if (!err_mask) { u8 *cmds = dev->ncq_send_recv_cmds; dev->flags |= ATA_DFLAG_NCQ_SEND_RECV; memcpy(cmds, dev->sector_buf, ATA_LOG_NCQ_SEND_RECV_SIZE); if (dev->quirks & ATA_QUIRK_NO_NCQ_TRIM) { ata_dev_dbg(dev, "disabling queued TRIM support\n"); cmds[ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET] &= ~ATA_LOG_NCQ_SEND_RECV_DSM_TRIM; } } } static void ata_dev_config_ncq_non_data(struct ata_device *dev) { unsigned int err_mask; if (!ata_log_supported(dev, ATA_LOG_NCQ_NON_DATA)) { ata_dev_warn(dev, "NCQ Non-Data Log not supported\n"); return; } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA, 0, dev->sector_buf, 1); if (!err_mask) memcpy(dev->ncq_non_data_cmds, dev->sector_buf, ATA_LOG_NCQ_NON_DATA_SIZE); } static void ata_dev_config_ncq_prio(struct ata_device *dev) { unsigned int err_mask; if (!ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS)) return; err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SATA_SETTINGS, dev->sector_buf, 1); if (err_mask) goto not_supported; if (!(dev->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3))) goto not_supported; dev->flags |= ATA_DFLAG_NCQ_PRIO; return; not_supported: dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLED; dev->flags &= ~ATA_DFLAG_NCQ_PRIO; } static bool ata_dev_check_adapter(struct ata_device *dev, unsigned short vendor_id) { struct pci_dev *pcidev = NULL; struct device *parent_dev = NULL; for (parent_dev = dev->tdev.parent; parent_dev != NULL; parent_dev = parent_dev->parent) { if (dev_is_pci(parent_dev)) { pcidev = to_pci_dev(parent_dev); if (pcidev->vendor == vendor_id) return true; break; } } return false; } static int ata_dev_config_ncq(struct ata_device *dev, char *desc, size_t desc_sz) { struct ata_port *ap = dev->link->ap; int hdepth = 0, ddepth = ata_id_queue_depth(dev->id); unsigned int err_mask; char *aa_desc = ""; if (!ata_id_has_ncq(dev->id)) { desc[0] = '\0'; return 0; } if (!IS_ENABLED(CONFIG_SATA_HOST)) return 0; if (dev->quirks & ATA_QUIRK_NONCQ) { snprintf(desc, desc_sz, "NCQ (not used)"); return 0; } if (dev->quirks & ATA_QUIRK_NO_NCQ_ON_ATI && ata_dev_check_adapter(dev, PCI_VENDOR_ID_ATI)) { snprintf(desc, desc_sz, "NCQ (not used)"); return 0; } if (ap->flags & ATA_FLAG_NCQ) { hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE); dev->flags |= ATA_DFLAG_NCQ; } if (!(dev->quirks & ATA_QUIRK_BROKEN_FPDMA_AA) && (ap->flags & ATA_FLAG_FPDMA_AA) && ata_id_has_fpdma_aa(dev->id)) { err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE, SATA_FPDMA_AA); if (err_mask) { ata_dev_err(dev, "failed to enable AA (error_mask=0x%x)\n", err_mask); if (err_mask != AC_ERR_DEV) { dev->quirks |= ATA_QUIRK_BROKEN_FPDMA_AA; return -EIO; } } else aa_desc = ", AA"; } if (hdepth >= ddepth) snprintf(desc, desc_sz, "NCQ (depth %d)%s", ddepth, aa_desc); else snprintf(desc, desc_sz, "NCQ (depth %d/%d)%s", hdepth, ddepth, aa_desc); if ((ap->flags & ATA_FLAG_FPDMA_AUX)) { if (ata_id_has_ncq_send_and_recv(dev->id)) ata_dev_config_ncq_send_recv(dev); if (ata_id_has_ncq_non_data(dev->id)) ata_dev_config_ncq_non_data(dev); if (ata_id_has_ncq_prio(dev->id)) ata_dev_config_ncq_prio(dev); } return 0; } static void ata_dev_config_sense_reporting(struct ata_device *dev) { unsigned int err_mask; if (!ata_id_has_sense_reporting(dev->id)) return; if (ata_id_sense_reporting_enabled(dev->id)) return; err_mask = ata_dev_set_feature(dev, SETFEATURE_SENSE_DATA, 0x1); if (err_mask) { ata_dev_dbg(dev, "failed to enable Sense Data Reporting, Emask 0x%x\n", err_mask); } } static void ata_dev_config_zac(struct ata_device *dev) { unsigned int err_mask; u8 *identify_buf = dev->sector_buf; dev->zac_zones_optimal_open = U32_MAX; dev->zac_zones_optimal_nonseq = U32_MAX; dev->zac_zones_max_open = U32_MAX; if (!ata_dev_is_zac(dev)) return; if (!ata_identify_page_supported(dev, ATA_LOG_ZONED_INFORMATION)) { ata_dev_warn(dev, "ATA Zoned Information Log not supported\n"); return; } /* * Read IDENTIFY DEVICE data log, page 9 (Zoned-device information) */ err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_ZONED_INFORMATION, identify_buf, 1); if (!err_mask) { u64 zoned_cap, opt_open, opt_nonseq, max_open; zoned_cap = get_unaligned_le64(&identify_buf[8]); if ((zoned_cap >> 63)) dev->zac_zoned_cap = (zoned_cap & 1); opt_open = get_unaligned_le64(&identify_buf[24]); if ((opt_open >> 63)) dev->zac_zones_optimal_open = (u32)opt_open; opt_nonseq = get_unaligned_le64(&identify_buf[32]); if ((opt_nonseq >> 63)) dev->zac_zones_optimal_nonseq = (u32)opt_nonseq; max_open = get_unaligned_le64(&identify_buf[40]); if ((max_open >> 63)) dev->zac_zones_max_open = (u32)max_open; } } static void ata_dev_config_trusted(struct ata_device *dev) { u64 trusted_cap; unsigned int err; if (!ata_id_has_trusted(dev->id)) return; if (!ata_identify_page_supported(dev, ATA_LOG_SECURITY)) { ata_dev_warn(dev, "Security Log not supported\n"); return; } err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SECURITY, dev->sector_buf, 1); if (err) return; trusted_cap = get_unaligned_le64(&dev->sector_buf[40]); if (!(trusted_cap & (1ULL << 63))) { ata_dev_dbg(dev, "Trusted Computing capability qword not valid!\n"); return; } if (trusted_cap & (1 << 0)) dev->flags |= ATA_DFLAG_TRUSTED; } static void ata_dev_cleanup_cdl_resources(struct ata_device *dev) { kfree(dev->cdl); dev->cdl = NULL; } static int ata_dev_init_cdl_resources(struct ata_device *dev) { struct ata_cdl *cdl = dev->cdl; unsigned int err_mask; if (!cdl) { cdl = kzalloc(sizeof(*cdl), GFP_KERNEL); if (!cdl) return -ENOMEM; dev->cdl = cdl; } err_mask = ata_read_log_page(dev, ATA_LOG_CDL, 0, cdl->desc_log_buf, ATA_LOG_CDL_SIZE / ATA_SECT_SIZE); if (err_mask) { ata_dev_warn(dev, "Read Command Duration Limits log failed\n"); ata_dev_cleanup_cdl_resources(dev); return -EIO; } return 0; } static void ata_dev_config_cdl(struct ata_device *dev) { unsigned int err_mask; bool cdl_enabled; u64 val; int ret; if (ata_id_major_version(dev->id) < 11) goto not_supported; if (!ata_log_supported(dev, ATA_LOG_IDENTIFY_DEVICE) || !ata_identify_page_supported(dev, ATA_LOG_SUPPORTED_CAPABILITIES) || !ata_identify_page_supported(dev, ATA_LOG_CURRENT_SETTINGS)) goto not_supported; err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SUPPORTED_CAPABILITIES, dev->sector_buf, 1); if (err_mask) goto not_supported; /* Check Command Duration Limit Supported bits */ val = get_unaligned_le64(&dev->sector_buf[168]); if (!(val & BIT_ULL(63)) || !(val & BIT_ULL(0))) goto not_supported; /* Warn the user if command duration guideline is not supported */ if (!(val & BIT_ULL(1))) ata_dev_warn(dev, "Command duration guideline is not supported\n"); /* * We must have support for the sense data for successful NCQ commands * log indicated by the successful NCQ command sense data supported bit. */ val = get_unaligned_le64(&dev->sector_buf[8]); if (!(val & BIT_ULL(63)) || !(val & BIT_ULL(47))) { ata_dev_warn(dev, "CDL supported but Successful NCQ Command Sense Data is not supported\n"); goto not_supported; } /* Without NCQ autosense, the successful NCQ commands log is useless. */ if (!ata_id_has_ncq_autosense(dev->id)) { ata_dev_warn(dev, "CDL supported but NCQ autosense is not supported\n"); goto not_supported; } /* * If CDL is marked as enabled, make sure the feature is enabled too. * Conversely, if CDL is disabled, make sure the feature is turned off. */ err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_CURRENT_SETTINGS, dev->sector_buf, 1); if (err_mask) goto not_supported; val = get_unaligned_le64(&dev->sector_buf[8]); cdl_enabled = val & BIT_ULL(63) && val & BIT_ULL(21); if (dev->flags & ATA_DFLAG_CDL_ENABLED) { if (!cdl_enabled) { /* Enable CDL on the device */ err_mask = ata_dev_set_feature(dev, SETFEATURES_CDL, 1); if (err_mask) { ata_dev_err(dev, "Enable CDL feature failed\n"); goto not_supported; } } } else { if (cdl_enabled) { /* Disable CDL on the device */ err_mask = ata_dev_set_feature(dev, SETFEATURES_CDL, 0); if (err_mask) { ata_dev_err(dev, "Disable CDL feature failed\n"); goto not_supported; } } } /* * While CDL itself has to be enabled using sysfs, CDL requires that * sense data for successful NCQ commands is enabled to work properly. * Just like ata_dev_config_sense_reporting(), enable it unconditionally * if supported. */ if (!(val & BIT_ULL(63)) || !(val & BIT_ULL(18))) { err_mask = ata_dev_set_feature(dev, SETFEATURE_SENSE_DATA_SUCC_NCQ, 0x1); if (err_mask) { ata_dev_warn(dev, "failed to enable Sense Data for successful NCQ commands, Emask 0x%x\n", err_mask); goto not_supported; } } /* CDL is supported: allocate and initialize needed resources. */ ret = ata_dev_init_cdl_resources(dev); if (ret) { ata_dev_warn(dev, "Initialize CDL resources failed\n"); goto not_supported; } dev->flags |= ATA_DFLAG_CDL; return; not_supported: dev->flags &= ~(ATA_DFLAG_CDL | ATA_DFLAG_CDL_ENABLED); ata_dev_cleanup_cdl_resources(dev); } static int ata_dev_config_lba(struct ata_device *dev) { const u16 *id = dev->id; const char *lba_desc; char ncq_desc[32]; int ret; dev->flags |= ATA_DFLAG_LBA; if (ata_id_has_lba48(id)) { lba_desc = "LBA48"; dev->flags |= ATA_DFLAG_LBA48; if (dev->n_sectors >= (1UL << 28) && ata_id_has_flush_ext(id)) dev->flags |= ATA_DFLAG_FLUSH_EXT; } else { lba_desc = "LBA"; } /* config NCQ */ ret = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc)); /* print device info to dmesg */ if (ata_dev_print_info(dev)) ata_dev_info(dev, "%llu sectors, multi %u: %s %s\n", (unsigned long long)dev->n_sectors, dev->multi_count, lba_desc, ncq_desc); return ret; } static void ata_dev_config_chs(struct ata_device *dev) { const u16 *id = dev->id; if (ata_id_current_chs_valid(id)) { /* Current CHS translation is valid. */ dev->cylinders = id[54]; dev->heads = id[55]; dev->sectors = id[56]; } else { /* Default translation */ dev->cylinders = id[1]; dev->heads = id[3]; dev->sectors = id[6]; } /* print device info to dmesg */ if (ata_dev_print_info(dev)) ata_dev_info(dev, "%llu sectors, multi %u, CHS %u/%u/%u\n", (unsigned long long)dev->n_sectors, dev->multi_count, dev->cylinders, dev->heads, dev->sectors); } static void ata_dev_config_fua(struct ata_device *dev) { /* Ignore FUA support if its use is disabled globally */ if (!libata_fua) goto nofua; /* Ignore devices without support for WRITE DMA FUA EXT */ if (!(dev->flags & ATA_DFLAG_LBA48) || !ata_id_has_fua(dev->id)) goto nofua; /* Ignore known bad devices and devices that lack NCQ support */ if (!ata_ncq_supported(dev) || (dev->quirks & ATA_QUIRK_NO_FUA)) goto nofua; dev->flags |= ATA_DFLAG_FUA; return; nofua: dev->flags &= ~ATA_DFLAG_FUA; } static void ata_dev_config_devslp(struct ata_device *dev) { u8 *sata_setting = dev->sector_buf; unsigned int err_mask; int i, j; /* * Check device sleep capability. Get DevSlp timing variables * from SATA Settings page of Identify Device Data Log. */ if (!ata_id_has_devslp(dev->id) || !ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS)) return; err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SATA_SETTINGS, sata_setting, 1); if (err_mask) return; dev->flags |= ATA_DFLAG_DEVSLP; for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) { j = ATA_LOG_DEVSLP_OFFSET + i; dev->devslp_timing[i] = sata_setting[j]; } } static void ata_dev_config_cpr(struct ata_device *dev) { unsigned int err_mask; size_t buf_len; int i, nr_cpr = 0; struct ata_cpr_log *cpr_log = NULL; u8 *desc, *buf = NULL; if (ata_id_major_version(dev->id) < 11) goto out; buf_len = ata_log_supported(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES); if (buf_len == 0) goto out; /* * Read the concurrent positioning ranges log (0x47). We can have at * most 255 32B range descriptors plus a 64B header. This log varies in * size, so use the size reported in the GPL directory. Reading beyond * the supported length will result in an error. */ buf_len <<= 9; buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) goto out; err_mask = ata_read_log_page(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES, 0, buf, buf_len >> 9); if (err_mask) goto out; nr_cpr = buf[0]; if (!nr_cpr) goto out; cpr_log = kzalloc(struct_size(cpr_log, cpr, nr_cpr), GFP_KERNEL); if (!cpr_log) goto out; cpr_log->nr_cpr = nr_cpr; desc = &buf[64]; for (i = 0; i < nr_cpr; i++, desc += 32) { cpr_log->cpr[i].num = desc[0]; cpr_log->cpr[i].num_storage_elements = desc[1]; cpr_log->cpr[i].start_lba = get_unaligned_le64(&desc[8]); cpr_log->cpr[i].num_lbas = get_unaligned_le64(&desc[16]); } out: swap(dev->cpr_log, cpr_log); kfree(cpr_log); kfree(buf); } /* * Configure features related to link power management. */ static void ata_dev_config_lpm(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; unsigned int err_mask; if (ap->flags & ATA_FLAG_NO_LPM) { /* * When the port does not support LPM, we cannot support it on * the device either. */ dev->quirks |= ATA_QUIRK_NOLPM; } else { /* * Some WD SATA-1 drives have issues with LPM, turn on NOLPM for * them. */ if ((dev->quirks & ATA_QUIRK_WD_BROKEN_LPM) && (dev->id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2) dev->quirks |= ATA_QUIRK_NOLPM; /* ATI specific quirk */ if ((dev->quirks & ATA_QUIRK_NO_LPM_ON_ATI) && ata_dev_check_adapter(dev, PCI_VENDOR_ID_ATI)) dev->quirks |= ATA_QUIRK_NOLPM; } if (dev->quirks & ATA_QUIRK_NOLPM && ap->target_lpm_policy != ATA_LPM_MAX_POWER) { ata_dev_warn(dev, "LPM support broken, forcing max_power\n"); ap->target_lpm_policy = ATA_LPM_MAX_POWER; } /* * Device Initiated Power Management (DIPM) is normally disabled by * default on a device. However, DIPM may have been enabled and that * setting kept even after COMRESET because of the Software Settings * Preservation feature. So if the port does not support DIPM and the * device does, disable DIPM on the device. */ if (ap->flags & ATA_FLAG_NO_DIPM && ata_id_has_dipm(dev->id)) { err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_DISABLE, SATA_DIPM); if (err_mask && err_mask != AC_ERR_DEV) ata_dev_err(dev, "Disable DIPM failed, Emask 0x%x\n", err_mask); } } static void ata_dev_print_features(struct ata_device *dev) { if (!(dev->flags & ATA_DFLAG_FEATURES_MASK)) return; ata_dev_info(dev, "Features:%s%s%s%s%s%s%s%s%s%s\n", dev->flags & ATA_DFLAG_FUA ? " FUA" : "", dev->flags & ATA_DFLAG_TRUSTED ? " Trust" : "", dev->flags & ATA_DFLAG_DA ? " Dev-Attention" : "", dev->flags & ATA_DFLAG_DEVSLP ? " Dev-Sleep" : "", ata_id_has_hipm(dev->id) ? " HIPM" : "", ata_id_has_dipm(dev->id) ? " DIPM" : "", dev->flags & ATA_DFLAG_NCQ_SEND_RECV ? " NCQ-sndrcv" : "", dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : "", dev->flags & ATA_DFLAG_CDL ? " CDL" : "", dev->cpr_log ? " CPR" : ""); } /** * ata_dev_configure - Configure the specified ATA/ATAPI device * @dev: Target device to configure * * Configure @dev according to @dev->id. Generic and low-level * driver specific fixups are also applied. * * LOCKING: * Kernel thread context (may sleep) * * RETURNS: * 0 on success, -errno otherwise */ int ata_dev_configure(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; bool print_info = ata_dev_print_info(dev); const u16 *id = dev->id; unsigned int xfer_mask; unsigned int err_mask; char revbuf[7]; /* XYZ-99\0 */ char fwrevbuf[ATA_ID_FW_REV_LEN+1]; char modelbuf[ATA_ID_PROD_LEN+1]; int rc; if (!ata_dev_enabled(dev)) { ata_dev_dbg(dev, "no device\n"); return 0; } /* Clear the general purpose log directory cache. */ ata_clear_log_directory(dev); /* Set quirks */ dev->quirks |= ata_dev_quirks(dev); ata_force_quirks(dev); if (dev->quirks & ATA_QUIRK_DISABLE) { ata_dev_info(dev, "unsupported device, disabling\n"); ata_dev_disable(dev); return 0; } if ((!atapi_enabled || (ap->flags & ATA_FLAG_NO_ATAPI)) && dev->class == ATA_DEV_ATAPI) { ata_dev_warn(dev, "WARNING: ATAPI is %s, device ignored\n", atapi_enabled ? "not supported with this driver" : "disabled"); ata_dev_disable(dev); return 0; } rc = ata_do_link_spd_quirk(dev); if (rc) return rc; /* let ACPI work its magic */ rc = ata_acpi_on_devcfg(dev); if (rc) return rc; /* massage HPA, do it early as it might change IDENTIFY data */ rc = ata_hpa_resize(dev); if (rc) return rc; /* print device capabilities */ ata_dev_dbg(dev, "%s: cfg 49:%04x 82:%04x 83:%04x 84:%04x " "85:%04x 86:%04x 87:%04x 88:%04x\n", __func__, id[49], id[82], id[83], id[84], id[85], id[86], id[87], id[88]); /* initialize to-be-configured parameters */ dev->flags &= ~ATA_DFLAG_CFG_MASK; dev->max_sectors = 0; dev->cdb_len = 0; dev->n_sectors = 0; dev->cylinders = 0; dev->heads = 0; dev->sectors = 0; dev->multi_count = 0; /* * common ATA, ATAPI feature tests */ /* find max transfer mode; for printk only */ xfer_mask = ata_id_xfermask(id); ata_dump_id(dev, id); /* SCSI only uses 4-char revisions, dump full 8 chars from ATA */ ata_id_c_string(dev->id, fwrevbuf, ATA_ID_FW_REV, sizeof(fwrevbuf)); ata_id_c_string(dev->id, modelbuf, ATA_ID_PROD, sizeof(modelbuf)); /* ATA-specific feature tests */ if (dev->class == ATA_DEV_ATA || dev->class == ATA_DEV_ZAC) { if (ata_id_is_cfa(id)) { /* CPRM may make this media unusable */ if (id[ATA_ID_CFA_KEY_MGMT] & 1) ata_dev_warn(dev, "supports DRM functions and may not be fully accessible\n"); snprintf(revbuf, 7, "CFA"); } else { snprintf(revbuf, 7, "ATA-%d", ata_id_major_version(id)); /* Warn the user if the device has TPM extensions */ if (ata_id_has_tpm(id)) ata_dev_warn(dev, "supports DRM functions and may not be fully accessible\n"); } dev->n_sectors = ata_id_n_sectors(id); /* get current R/W Multiple count setting */ if ((dev->id[47] >> 8) == 0x80 && (dev->id[59] & 0x100)) { unsigned int max = dev->id[47] & 0xff; unsigned int cnt = dev->id[59] & 0xff; /* only recognize/allow powers of two here */ if (is_power_of_2(max) && is_power_of_2(cnt)) if (cnt <= max) dev->multi_count = cnt; } /* print device info to dmesg */ if (print_info) ata_dev_info(dev, "%s: %s, %s, max %s\n", revbuf, modelbuf, fwrevbuf, ata_mode_string(xfer_mask)); if (ata_id_has_lba(id)) { rc = ata_dev_config_lba(dev); if (rc) return rc; } else { ata_dev_config_chs(dev); } ata_dev_config_lpm(dev); ata_dev_config_fua(dev); ata_dev_config_devslp(dev); ata_dev_config_sense_reporting(dev); ata_dev_config_zac(dev); ata_dev_config_trusted(dev); ata_dev_config_cpr(dev); ata_dev_config_cdl(dev); dev->cdb_len = 32; if (print_info) ata_dev_print_features(dev); } /* ATAPI-specific feature tests */ else if (dev->class == ATA_DEV_ATAPI) { const char *cdb_intr_string = ""; const char *atapi_an_string = ""; const char *dma_dir_string = ""; u32 sntf; rc = atapi_cdb_len(id); if ((rc < 12) || (rc > ATAPI_CDB_LEN)) { ata_dev_warn(dev, "unsupported CDB len %d\n", rc); rc = -EINVAL; goto err_out_nosup; } dev->cdb_len = (unsigned int) rc; /* Enable ATAPI AN if both the host and device have * the support. If PMP is attached, SNTF is required * to enable ATAPI AN to discern between PHY status * changed notifications and ATAPI ANs. */ if (atapi_an && (ap->flags & ATA_FLAG_AN) && ata_id_has_atapi_AN(id) && (!sata_pmp_attached(ap) || sata_scr_read(&ap->link, SCR_NOTIFICATION, &sntf) == 0)) { /* issue SET feature command to turn this on */ err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE, SATA_AN); if (err_mask) ata_dev_err(dev, "failed to enable ATAPI AN (err_mask=0x%x)\n", err_mask); else { dev->flags |= ATA_DFLAG_AN; atapi_an_string = ", ATAPI AN"; } } if (ata_id_cdb_intr(dev->id)) { dev->flags |= ATA_DFLAG_CDB_INTR; cdb_intr_string = ", CDB intr"; } if (atapi_dmadir || (dev->quirks & ATA_QUIRK_ATAPI_DMADIR) || atapi_id_dmadir(dev->id)) { dev->flags |= ATA_DFLAG_DMADIR; dma_dir_string = ", DMADIR"; } if (ata_id_has_da(dev->id)) { dev->flags |= ATA_DFLAG_DA; zpodd_init(dev); } /* print device info to dmesg */ if (print_info) ata_dev_info(dev, "ATAPI: %s, %s, max %s%s%s%s\n", modelbuf, fwrevbuf, ata_mode_string(xfer_mask), cdb_intr_string, atapi_an_string, dma_dir_string); } /* determine max_sectors */ dev->max_sectors = ATA_MAX_SECTORS; if (dev->flags & ATA_DFLAG_LBA48) dev->max_sectors = ATA_MAX_SECTORS_LBA48; /* Limit PATA drive on SATA cable bridge transfers to udma5, 200 sectors */ if (ata_dev_knobble(dev)) { if (print_info) ata_dev_info(dev, "applying bridge limits\n"); dev->udma_mask &= ATA_UDMA5; dev->max_sectors = ATA_MAX_SECTORS; } if ((dev->class == ATA_DEV_ATAPI) && (atapi_command_packet_set(id) == TYPE_TAPE)) { dev->max_sectors = ATA_MAX_SECTORS_TAPE; dev->quirks |= ATA_QUIRK_STUCK_ERR; } if (dev->quirks & ATA_QUIRK_MAX_SEC_128) dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128, dev->max_sectors); if (dev->quirks & ATA_QUIRK_MAX_SEC_1024) dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_1024, dev->max_sectors); if (dev->quirks & ATA_QUIRK_MAX_SEC_LBA48) dev->max_sectors = ATA_MAX_SECTORS_LBA48; if (ap->ops->dev_config) ap->ops->dev_config(dev); if (dev->quirks & ATA_QUIRK_DIAGNOSTIC) { /* Let the user know. We don't want to disallow opens for rescue purposes, or in case the vendor is just a blithering idiot. Do this after the dev_config call as some controllers with buggy firmware may want to avoid reporting false device bugs */ if (print_info) { ata_dev_warn(dev, "Drive reports diagnostics failure. This may indicate a drive\n"); ata_dev_warn(dev, "fault or invalid emulation. Contact drive vendor for information.\n"); } } if ((dev->quirks & ATA_QUIRK_FIRMWARE_WARN) && print_info) { ata_dev_warn(dev, "WARNING: device requires firmware update to be fully functional\n"); ata_dev_warn(dev, " contact the vendor or visit http://ata.wiki.kernel.org\n"); } return 0; err_out_nosup: return rc; } /** * ata_cable_40wire - return 40 wire cable type * @ap: port * * Helper method for drivers which want to hardwire 40 wire cable * detection. */ int ata_cable_40wire(struct ata_port *ap) { return ATA_CBL_PATA40; } EXPORT_SYMBOL_GPL(ata_cable_40wire); /** * ata_cable_80wire - return 80 wire cable type * @ap: port * * Helper method for drivers which want to hardwire 80 wire cable * detection. */ int ata_cable_80wire(struct ata_port *ap) { return ATA_CBL_PATA80; } EXPORT_SYMBOL_GPL(ata_cable_80wire); /** * ata_cable_unknown - return unknown PATA cable. * @ap: port * * Helper method for drivers which have no PATA cable detection. */ int ata_cable_unknown(struct ata_port *ap) { return ATA_CBL_PATA_UNK; } EXPORT_SYMBOL_GPL(ata_cable_unknown); /** * ata_cable_ignore - return ignored PATA cable. * @ap: port * * Helper method for drivers which don't use cable type to limit * transfer mode. */ int ata_cable_ignore(struct ata_port *ap) { return ATA_CBL_PATA_IGN; } EXPORT_SYMBOL_GPL(ata_cable_ignore); /** * ata_cable_sata - return SATA cable type * @ap: port * * Helper method for drivers which have SATA cables */ int ata_cable_sata(struct ata_port *ap) { return ATA_CBL_SATA; } EXPORT_SYMBOL_GPL(ata_cable_sata); /** * sata_print_link_status - Print SATA link status * @link: SATA link to printk link status about * * This function prints link speed and status of a SATA link. * * LOCKING: * None. */ static void sata_print_link_status(struct ata_link *link) { u32 sstatus, scontrol, tmp; if (sata_scr_read(link, SCR_STATUS, &sstatus)) return; if (sata_scr_read(link, SCR_CONTROL, &scontrol)) return; if (ata_phys_link_online(link)) { tmp = (sstatus >> 4) & 0xf; ata_link_info(link, "SATA link up %s (SStatus %X SControl %X)\n", sata_spd_string(tmp), sstatus, scontrol); } else { ata_link_info(link, "SATA link down (SStatus %X SControl %X)\n", sstatus, scontrol); } } /** * ata_dev_pair - return other device on cable * @adev: device * * Obtain the other device on the same cable, or if none is * present NULL is returned */ struct ata_device *ata_dev_pair(struct ata_device *adev) { struct ata_link *link = adev->link; struct ata_device *pair = &link->device[1 - adev->devno]; if (!ata_dev_enabled(pair)) return NULL; return pair; } EXPORT_SYMBOL_GPL(ata_dev_pair); #ifdef CONFIG_ATA_ACPI /** * ata_timing_cycle2mode - find xfer mode for the specified cycle duration * @xfer_shift: ATA_SHIFT_* value for transfer type to examine. * @cycle: cycle duration in ns * * Return matching xfer mode for @cycle. The returned mode is of * the transfer type specified by @xfer_shift. If @cycle is too * slow for @xfer_shift, 0xff is returned. If @cycle is faster * than the fastest known mode, the fasted mode is returned. * * LOCKING: * None. * * RETURNS: * Matching xfer_mode, 0xff if no match found. */ u8 ata_timing_cycle2mode(unsigned int xfer_shift, int cycle) { u8 base_mode = 0xff, last_mode = 0xff; const struct ata_xfer_ent *ent; const struct ata_timing *t; for (ent = ata_xfer_tbl; ent->shift >= 0; ent++) if (ent->shift == xfer_shift) base_mode = ent->base; for (t = ata_timing_find_mode(base_mode); t && ata_xfer_mode2shift(t->mode) == xfer_shift; t++) { unsigned short this_cycle; switch (xfer_shift) { case ATA_SHIFT_PIO: case ATA_SHIFT_MWDMA: this_cycle = t->cycle; break; case ATA_SHIFT_UDMA: this_cycle = t->udma; break; default: return 0xff; } if (cycle > this_cycle) break; last_mode = t->mode; } return last_mode; } #endif /** * ata_down_xfermask_limit - adjust dev xfer masks downward * @dev: Device to adjust xfer masks * @sel: ATA_DNXFER_* selector * * Adjust xfer masks of @dev downward. Note that this function * does not apply the change. Invoking ata_set_mode() afterwards * will apply the limit. * * LOCKING: * Inherited from caller. * * RETURNS: * 0 on success, negative errno on failure */ int ata_down_xfermask_limit(struct ata_device *dev, unsigned int sel) { char buf[32]; unsigned int orig_mask, xfer_mask; unsigned int pio_mask, mwdma_mask, udma_mask; int quiet, highbit; quiet = !!(sel & ATA_DNXFER_QUIET); sel &= ~ATA_DNXFER_QUIET; xfer_mask = orig_mask = ata_pack_xfermask(dev->pio_mask, dev->mwdma_mask, dev->udma_mask); ata_unpack_xfermask(xfer_mask, &pio_mask, &mwdma_mask, &udma_mask); switch (sel) { case ATA_DNXFER_PIO: highbit = fls(pio_mask) - 1; pio_mask &= ~(1 << highbit); break; case ATA_DNXFER_DMA: if (udma_mask) { highbit = fls(udma_mask) - 1; udma_mask &= ~(1 << highbit); if (!udma_mask) return -ENOENT; } else if (mwdma_mask) { highbit = fls(mwdma_mask) - 1; mwdma_mask &= ~(1 << highbit); if (!mwdma_mask) return -ENOENT; } break; case ATA_DNXFER_40C: udma_mask &= ATA_UDMA_MASK_40C; break; case ATA_DNXFER_FORCE_PIO0: pio_mask &= 1; fallthrough; case ATA_DNXFER_FORCE_PIO: mwdma_mask = 0; udma_mask = 0; break; default: BUG(); } xfer_mask &= ata_pack_xfermask(pio_mask, mwdma_mask, udma_mask); if (!(xfer_mask & ATA_MASK_PIO) || xfer_mask == orig_mask) return -ENOENT; if (!quiet) { if (xfer_mask & (ATA_MASK_MWDMA | ATA_MASK_UDMA)) snprintf(buf, sizeof(buf), "%s:%s", ata_mode_string(xfer_mask), ata_mode_string(xfer_mask & ATA_MASK_PIO)); else snprintf(buf, sizeof(buf), "%s", ata_mode_string(xfer_mask)); ata_dev_warn(dev, "limiting speed to %s\n", buf); } ata_unpack_xfermask(xfer_mask, &dev->pio_mask, &dev->mwdma_mask, &dev->udma_mask); return 0; } static int ata_dev_set_mode(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; struct ata_eh_context *ehc = &dev->link->eh_context; const bool nosetxfer = dev->quirks & ATA_QUIRK_NOSETXFER; const char *dev_err_whine = ""; int ign_dev_err = 0; unsigned int err_mask = 0; int rc; dev->flags &= ~ATA_DFLAG_PIO; if (dev->xfer_shift == ATA_SHIFT_PIO) dev->flags |= ATA_DFLAG_PIO; if (nosetxfer && ap->flags & ATA_FLAG_SATA && ata_id_is_sata(dev->id)) dev_err_whine = " (SET_XFERMODE skipped)"; else { if (nosetxfer) ata_dev_warn(dev, "NOSETXFER but PATA detected - can't " "skip SETXFER, might malfunction\n"); err_mask = ata_dev_set_xfermode(dev); } if (err_mask & ~AC_ERR_DEV) goto fail; /* revalidate */ ehc->i.flags |= ATA_EHI_POST_SETMODE; rc = ata_dev_revalidate(dev, ATA_DEV_UNKNOWN, 0); ehc->i.flags &= ~ATA_EHI_POST_SETMODE; if (rc) return rc; if (dev->xfer_shift == ATA_SHIFT_PIO) { /* Old CFA may refuse this command, which is just fine */ if (ata_id_is_cfa(dev->id)) ign_dev_err = 1; /* Catch several broken garbage emulations plus some pre ATA devices */ if (ata_id_major_version(dev->id) == 0 && dev->pio_mode <= XFER_PIO_2) ign_dev_err = 1; /* Some very old devices and some bad newer ones fail any kind of SET_XFERMODE request but support PIO0-2 timings and no IORDY */ if (!ata_id_has_iordy(dev->id) && dev->pio_mode <= XFER_PIO_2) ign_dev_err = 1; } /* Early MWDMA devices do DMA but don't allow DMA mode setting. Don't fail an MWDMA0 set IFF the device indicates it is in MWDMA0 */ if (dev->xfer_shift == ATA_SHIFT_MWDMA && dev->dma_mode == XFER_MW_DMA_0 && (dev->id[63] >> 8) & 1) ign_dev_err = 1; /* if the device is actually configured correctly, ignore dev err */ if (dev->xfer_mode == ata_xfer_mask2mode(ata_id_xfermask(dev->id))) ign_dev_err = 1; if (err_mask & AC_ERR_DEV) { if (!ign_dev_err) goto fail; else dev_err_whine = " (device error ignored)"; } ata_dev_dbg(dev, "xfer_shift=%u, xfer_mode=0x%x\n", dev->xfer_shift, (int)dev->xfer_mode); if (!(ehc->i.flags & ATA_EHI_QUIET) || ehc->i.flags & ATA_EHI_DID_HARDRESET) ata_dev_info(dev, "configured for %s%s\n", ata_mode_string(ata_xfer_mode2mask(dev->xfer_mode)), dev_err_whine); return 0; fail: ata_dev_err(dev, "failed to set xfermode (err_mask=0x%x)\n", err_mask); return -EIO; } /** * ata_set_mode - Program timings and issue SET FEATURES - XFER * @link: link on which timings will be programmed * @r_failed_dev: out parameter for failed device * * Standard implementation of the function used to tune and set * ATA device disk transfer mode (PIO3, UDMA6, etc.). If * ata_dev_set_mode() fails, pointer to the failing device is * returned in @r_failed_dev. * * LOCKING: * PCI/etc. bus probe sem. * * RETURNS: * 0 on success, negative errno otherwise */ int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev) { struct ata_port *ap = link->ap; struct ata_device *dev; int rc = 0, used_dma = 0, found = 0; /* step 1: calculate xfer_mask */ ata_for_each_dev(dev, link, ENABLED) { unsigned int pio_mask, dma_mask; unsigned int mode_mask; mode_mask = ATA_DMA_MASK_ATA; if (dev->class == ATA_DEV_ATAPI) mode_mask = ATA_DMA_MASK_ATAPI; else if (ata_id_is_cfa(dev->id)) mode_mask = ATA_DMA_MASK_CFA; ata_dev_xfermask(dev); ata_force_xfermask(dev); pio_mask = ata_pack_xfermask(dev->pio_mask, 0, 0); if (libata_dma_mask & mode_mask) dma_mask = ata_pack_xfermask(0, dev->mwdma_mask, dev->udma_mask); else dma_mask = 0; dev->pio_mode = ata_xfer_mask2mode(pio_mask); dev->dma_mode = ata_xfer_mask2mode(dma_mask); found = 1; if (ata_dma_enabled(dev)) used_dma = 1; } if (!found) goto out; /* step 2: always set host PIO timings */ ata_for_each_dev(dev, link, ENABLED) { if (dev->pio_mode == 0xff) { ata_dev_warn(dev, "no PIO support\n"); rc = -EINVAL; goto out; } dev->xfer_mode = dev->pio_mode; dev->xfer_shift = ATA_SHIFT_PIO; if (ap->ops->set_piomode) ap->ops->set_piomode(ap, dev); } /* step 3: set host DMA timings */ ata_for_each_dev(dev, link, ENABLED) { if (!ata_dma_enabled(dev)) continue; dev->xfer_mode = dev->dma_mode; dev->xfer_shift = ata_xfer_mode2shift(dev->dma_mode); if (ap->ops->set_dmamode) ap->ops->set_dmamode(ap, dev); } /* step 4: update devices' xfer mode */ ata_for_each_dev(dev, link, ENABLED) { rc = ata_dev_set_mode(dev); if (rc) goto out; } /* Record simplex status. If we selected DMA then the other * host channels are not permitted to do so. */ if (used_dma && (ap->host->flags & ATA_HOST_SIMPLEX)) ap->host->simplex_claimed = ap; out: if (rc) *r_failed_dev = dev; return rc; } EXPORT_SYMBOL_GPL(ata_set_mode); /** * ata_wait_ready - wait for link to become ready * @link: link to be waited on * @deadline: deadline jiffies for the operation * @check_ready: callback to check link readiness * * Wait for @link to become ready. @check_ready should return * positive number if @link is ready, 0 if it isn't, -ENODEV if * link doesn't seem to be occupied, other errno for other error * conditions. * * Transient -ENODEV conditions are allowed for * ATA_TMOUT_FF_WAIT. * * LOCKING: * EH context. * * RETURNS: * 0 if @link is ready before @deadline; otherwise, -errno. */ int ata_wait_ready(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)) { unsigned long start = jiffies; unsigned long nodev_deadline; int warned = 0; /* choose which 0xff timeout to use, read comment in libata.h */ if (link->ap->host->flags & ATA_HOST_PARALLEL_SCAN) nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT_LONG); else nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT); /* Slave readiness can't be tested separately from master. On * M/S emulation configuration, this function should be called * only on the master and it will handle both master and slave. */ WARN_ON(link == link->ap->slave_link); if (time_after(nodev_deadline, deadline)) nodev_deadline = deadline; while (1) { unsigned long now = jiffies; int ready, tmp; ready = tmp = check_ready(link); if (ready > 0) return 0; /* * -ENODEV could be transient. Ignore -ENODEV if link * is online. Also, some SATA devices take a long * time to clear 0xff after reset. Wait for * ATA_TMOUT_FF_WAIT[_LONG] on -ENODEV if link isn't * offline. * * Note that some PATA controllers (pata_ali) explode * if status register is read more than once when * there's no device attached. */ if (ready == -ENODEV) { if (ata_link_online(link)) ready = 0; else if ((link->ap->flags & ATA_FLAG_SATA) && !ata_link_offline(link) && time_before(now, nodev_deadline)) ready = 0; } if (ready) return ready; if (time_after(now, deadline)) return -EBUSY; if (!warned && time_after(now, start + 5 * HZ) && (deadline - now > 3 * HZ)) { ata_link_warn(link, "link is slow to respond, please be patient " "(ready=%d)\n", tmp); warned = 1; } ata_msleep(link->ap, 50); } } /** * ata_wait_after_reset - wait for link to become ready after reset * @link: link to be waited on * @deadline: deadline jiffies for the operation * @check_ready: callback to check link readiness * * Wait for @link to become ready after reset. * * LOCKING: * EH context. * * RETURNS: * 0 if @link is ready before @deadline; otherwise, -errno. */ int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)) { ata_msleep(link->ap, ATA_WAIT_AFTER_RESET); return ata_wait_ready(link, deadline, check_ready); } EXPORT_SYMBOL_GPL(ata_wait_after_reset); /** * ata_std_prereset - prepare for reset * @link: ATA link to be reset * @deadline: deadline jiffies for the operation * * @link is about to be reset. Initialize it. Failure from * prereset makes libata abort whole reset sequence and give up * that port, so prereset should be best-effort. It does its * best to prepare for reset sequence but if things go wrong, it * should just whine, not fail. * * LOCKING: * Kernel thread context (may sleep) * * RETURNS: * Always 0. */ int ata_std_prereset(struct ata_link *link, unsigned long deadline) { struct ata_port *ap = link->ap; struct ata_eh_context *ehc = &link->eh_context; const unsigned int *timing = sata_ehc_deb_timing(ehc); int rc; /* if we're about to do hardreset, nothing more to do */ if (ehc->i.action & ATA_EH_HARDRESET) return 0; /* if SATA, resume link */ if (ap->flags & ATA_FLAG_SATA) { rc = sata_link_resume(link, timing, deadline); /* whine about phy resume failure but proceed */ if (rc && rc != -EOPNOTSUPP) ata_link_warn(link, "failed to resume link for reset (errno=%d)\n", rc); } /* no point in trying softreset on offline link */ if (ata_phys_link_offline(link)) ehc->i.action &= ~ATA_EH_SOFTRESET; return 0; } EXPORT_SYMBOL_GPL(ata_std_prereset); /** * ata_std_postreset - standard postreset callback * @link: the target ata_link * @classes: classes of attached devices * * This function is invoked after a successful reset. Note that * the device might have been reset more than once using * different reset methods before postreset is invoked. * * LOCKING: * Kernel thread context (may sleep) */ void ata_std_postreset(struct ata_link *link, unsigned int *classes) { u32 serror; /* reset complete, clear SError */ if (!sata_scr_read(link, SCR_ERROR, &serror)) sata_scr_write(link, SCR_ERROR, serror); /* print link status */ sata_print_link_status(link); } EXPORT_SYMBOL_GPL(ata_std_postreset); /** * ata_dev_same_device - Determine whether new ID matches configured device * @dev: device to compare against * @new_class: class of the new device * @new_id: IDENTIFY page of the new device * * Compare @new_class and @new_id against @dev and determine * whether @dev is the device indicated by @new_class and * @new_id. * * LOCKING: * None. * * RETURNS: * 1 if @dev matches @new_class and @new_id, 0 otherwise. */ static int ata_dev_same_device(struct ata_device *dev, unsigned int new_class, const u16 *new_id) { const u16 *old_id = dev->id; unsigned char model[2][ATA_ID_PROD_LEN + 1]; unsigned char serial[2][ATA_ID_SERNO_LEN + 1]; if (dev->class != new_class) { ata_dev_info(dev, "class mismatch %d != %d\n", dev->class, new_class); return 0; } ata_id_c_string(old_id, model[0], ATA_ID_PROD, sizeof(model[0])); ata_id_c_string(new_id, model[1], ATA_ID_PROD, sizeof(model[1])); ata_id_c_string(old_id, serial[0], ATA_ID_SERNO, sizeof(serial[0])); ata_id_c_string(new_id, serial[1], ATA_ID_SERNO, sizeof(serial[1])); if (strcmp(model[0], model[1])) { ata_dev_info(dev, "model number mismatch '%s' != '%s'\n", model[0], model[1]); return 0; } if (strcmp(serial[0], serial[1])) { ata_dev_info(dev, "serial number mismatch '%s' != '%s'\n", serial[0], serial[1]); return 0; } return 1; } /** * ata_dev_reread_id - Re-read IDENTIFY data * @dev: target ATA device * @readid_flags: read ID flags * * Re-read IDENTIFY page and make sure @dev is still attached to * the port. * * LOCKING: * Kernel thread context (may sleep) * * RETURNS: * 0 on success, negative errno otherwise */ int ata_dev_reread_id(struct ata_device *dev, unsigned int readid_flags) { unsigned int class = dev->class; u16 *id = (void *)dev->sector_buf; int rc; /* read ID data */ rc = ata_dev_read_id(dev, &class, readid_flags, id); if (rc) return rc; /* is the device still there? */ if (!ata_dev_same_device(dev, class, id)) return -ENODEV; memcpy(dev->id, id, sizeof(id[0]) * ATA_ID_WORDS); return 0; } /** * ata_dev_revalidate - Revalidate ATA device * @dev: device to revalidate * @new_class: new class code * @readid_flags: read ID flags * * Re-read IDENTIFY page, make sure @dev is still attached to the * port and reconfigure it according to the new IDENTIFY page. * * LOCKING: * Kernel thread context (may sleep) * * RETURNS: * 0 on success, negative errno otherwise */ int ata_dev_revalidate(struct ata_device *dev, unsigned int new_class, unsigned int readid_flags) { u64 n_sectors = dev->n_sectors; u64 n_native_sectors = dev->n_native_sectors; int rc; if (!ata_dev_enabled(dev)) return -ENODEV; /* fail early if !ATA && !ATAPI to avoid issuing [P]IDENTIFY to PMP */ if (ata_class_enabled(new_class) && new_class == ATA_DEV_PMP) { ata_dev_info(dev, "class mismatch %u != %u\n", dev->class, new_class); rc = -ENODEV; goto fail; } /* re-read ID */ rc = ata_dev_reread_id(dev, readid_flags); if (rc) goto fail; /* configure device according to the new ID */ rc = ata_dev_configure(dev); if (rc) goto fail; /* verify n_sectors hasn't changed */ if (dev->class != ATA_DEV_ATA || !n_sectors || dev->n_sectors == n_sectors) return 0; /* n_sectors has changed */ ata_dev_warn(dev, "n_sectors mismatch %llu != %llu\n", (unsigned long long)n_sectors, (unsigned long long)dev->n_sectors); /* * Something could have caused HPA to be unlocked * involuntarily. If n_native_sectors hasn't changed and the * new size matches it, keep the device. */ if (dev->n_native_sectors == n_native_sectors && dev->n_sectors > n_sectors && dev->n_sectors == n_native_sectors) { ata_dev_warn(dev, "new n_sectors matches native, probably " "late HPA unlock, n_sectors updated\n"); /* use the larger n_sectors */ return 0; } /* * Some BIOSes boot w/o HPA but resume w/ HPA locked. Try * unlocking HPA in those cases. * * https://bugzilla.kernel.org/show_bug.cgi?id=15396 */ if (dev->n_native_sectors == n_native_sectors && dev->n_sectors < n_sectors && n_sectors == n_native_sectors && !(dev->quirks & ATA_QUIRK_BROKEN_HPA)) { ata_dev_warn(dev, "old n_sectors matches native, probably " "late HPA lock, will try to unlock HPA\n"); /* try unlocking HPA */ dev->flags |= ATA_DFLAG_UNLOCK_HPA; rc = -EIO; } else rc = -ENODEV; /* restore original n_[native_]sectors and fail */ dev->n_native_sectors = n_native_sectors; dev->n_sectors = n_sectors; fail: ata_dev_err(dev, "revalidation failed (errno=%d)\n", rc); return rc; } static const char * const ata_quirk_names[] = { [__ATA_QUIRK_DIAGNOSTIC] = "diagnostic", [__ATA_QUIRK_NODMA] = "nodma", [__ATA_QUIRK_NONCQ] = "noncq", [__ATA_QUIRK_MAX_SEC_128] = "maxsec128", [__ATA_QUIRK_BROKEN_HPA] = "brokenhpa", [__ATA_QUIRK_DISABLE] = "disable", [__ATA_QUIRK_HPA_SIZE] = "hpasize", [__ATA_QUIRK_IVB] = "ivb", [__ATA_QUIRK_STUCK_ERR] = "stuckerr", [__ATA_QUIRK_BRIDGE_OK] = "bridgeok", [__ATA_QUIRK_ATAPI_MOD16_DMA] = "atapimod16dma", [__ATA_QUIRK_FIRMWARE_WARN] = "firmwarewarn", [__ATA_QUIRK_1_5_GBPS] = "1.5gbps", [__ATA_QUIRK_NOSETXFER] = "nosetxfer", [__ATA_QUIRK_BROKEN_FPDMA_AA] = "brokenfpdmaaa", [__ATA_QUIRK_DUMP_ID] = "dumpid", [__ATA_QUIRK_MAX_SEC_LBA48] = "maxseclba48", [__ATA_QUIRK_ATAPI_DMADIR] = "atapidmadir", [__ATA_QUIRK_NO_NCQ_TRIM] = "noncqtrim", [__ATA_QUIRK_NOLPM] = "nolpm", [__ATA_QUIRK_WD_BROKEN_LPM] = "wdbrokenlpm", [__ATA_QUIRK_ZERO_AFTER_TRIM] = "zeroaftertrim", [__ATA_QUIRK_NO_DMA_LOG] = "nodmalog", [__ATA_QUIRK_NOTRIM] = "notrim", [__ATA_QUIRK_MAX_SEC_1024] = "maxsec1024", [__ATA_QUIRK_MAX_TRIM_128M] = "maxtrim128m", [__ATA_QUIRK_NO_NCQ_ON_ATI] = "noncqonati", [__ATA_QUIRK_NO_LPM_ON_ATI] = "nolpmonati", [__ATA_QUIRK_NO_ID_DEV_LOG] = "noiddevlog", [__ATA_QUIRK_NO_LOG_DIR] = "nologdir", [__ATA_QUIRK_NO_FUA] = "nofua", }; static void ata_dev_print_quirks(const struct ata_device *dev, const char *model, const char *rev, unsigned int quirks) { struct ata_eh_context *ehc = &dev->link->eh_context; int n = 0, i; size_t sz; char *str; if (!ata_dev_print_info(dev) || ehc->i.flags & ATA_EHI_DID_PRINT_QUIRKS) return; ehc->i.flags |= ATA_EHI_DID_PRINT_QUIRKS; if (!quirks) return; sz = 64 + ARRAY_SIZE(ata_quirk_names) * 16; str = kmalloc(sz, GFP_KERNEL); if (!str) return; n = snprintf(str, sz, "Model '%s', rev '%s', applying quirks:", model, rev); for (i = 0; i < ARRAY_SIZE(ata_quirk_names); i++) { if (quirks & (1U << i)) n += snprintf(str + n, sz - n, " %s", ata_quirk_names[i]); } ata_dev_warn(dev, "%s\n", str); kfree(str); } struct ata_dev_quirks_entry { const char *model_num; const char *model_rev; unsigned int quirks; }; static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { /* Devices with DMA related problems under Linux */ { "WDC AC11000H", NULL, ATA_QUIRK_NODMA }, { "WDC AC22100H", NULL, ATA_QUIRK_NODMA }, { "WDC AC32500H", NULL, ATA_QUIRK_NODMA }, { "WDC AC33100H", NULL, ATA_QUIRK_NODMA }, { "WDC AC31600H", NULL, ATA_QUIRK_NODMA }, { "WDC AC32100H", "24.09P07", ATA_QUIRK_NODMA }, { "WDC AC23200L", "21.10N21", ATA_QUIRK_NODMA }, { "Compaq CRD-8241B", NULL, ATA_QUIRK_NODMA }, { "CRD-8400B", NULL, ATA_QUIRK_NODMA }, { "CRD-848[02]B", NULL, ATA_QUIRK_NODMA }, { "CRD-84", NULL, ATA_QUIRK_NODMA }, { "SanDisk SDP3B", NULL, ATA_QUIRK_NODMA }, { "SanDisk SDP3B-64", NULL, ATA_QUIRK_NODMA }, { "SANYO CD-ROM CRD", NULL, ATA_QUIRK_NODMA }, { "HITACHI CDR-8", NULL, ATA_QUIRK_NODMA }, { "HITACHI CDR-8[34]35", NULL, ATA_QUIRK_NODMA }, { "Toshiba CD-ROM XM-6202B", NULL, ATA_QUIRK_NODMA }, { "TOSHIBA CD-ROM XM-1702BC", NULL, ATA_QUIRK_NODMA }, { "CD-532E-A", NULL, ATA_QUIRK_NODMA }, { "E-IDE CD-ROM CR-840", NULL, ATA_QUIRK_NODMA }, { "CD-ROM Drive/F5A", NULL, ATA_QUIRK_NODMA }, { "WPI CDD-820", NULL, ATA_QUIRK_NODMA }, { "SAMSUNG CD-ROM SC-148C", NULL, ATA_QUIRK_NODMA }, { "SAMSUNG CD-ROM SC", NULL, ATA_QUIRK_NODMA }, { "ATAPI CD-ROM DRIVE 40X MAXIMUM", NULL, ATA_QUIRK_NODMA }, { "_NEC DV5800A", NULL, ATA_QUIRK_NODMA }, { "SAMSUNG CD-ROM SN-124", "N001", ATA_QUIRK_NODMA }, { "Seagate STT20000A", NULL, ATA_QUIRK_NODMA }, { " 2GB ATA Flash Disk", "ADMA428M", ATA_QUIRK_NODMA }, { "VRFDFC22048UCHC-TE*", NULL, ATA_QUIRK_NODMA }, /* Odd clown on sil3726/4726 PMPs */ { "Config Disk", NULL, ATA_QUIRK_DISABLE }, /* Similar story with ASMedia 1092 */ { "ASMT109x- Config", NULL, ATA_QUIRK_DISABLE }, /* Weird ATAPI devices */ { "TORiSAN DVD-ROM DRD-N216", NULL, ATA_QUIRK_MAX_SEC_128 }, { "QUANTUM DAT DAT72-000", NULL, ATA_QUIRK_ATAPI_MOD16_DMA }, { "Slimtype DVD A DS8A8SH", NULL, ATA_QUIRK_MAX_SEC_LBA48 }, { "Slimtype DVD A DS8A9SH", NULL, ATA_QUIRK_MAX_SEC_LBA48 }, /* * Causes silent data corruption with higher max sects. * http://lkml.kernel.org/g/x49wpy40ysk.fsf@segfault.boston.devel.redhat.com */ { "ST380013AS", "3.20", ATA_QUIRK_MAX_SEC_1024 }, /* * These devices time out with higher max sects. * https://bugzilla.kernel.org/show_bug.cgi?id=121671 */ { "LITEON CX1-JB*-HP", NULL, ATA_QUIRK_MAX_SEC_1024 }, { "LITEON EP1-*", NULL, ATA_QUIRK_MAX_SEC_1024 }, /* Devices we expect to fail diagnostics */ /* Devices where NCQ should be avoided */ /* NCQ is slow */ { "WDC WD740ADFD-00", NULL, ATA_QUIRK_NONCQ }, { "WDC WD740ADFD-00NLR1", NULL, ATA_QUIRK_NONCQ }, /* http://thread.gmane.org/gmane.linux.ide/14907 */ { "FUJITSU MHT2060BH", NULL, ATA_QUIRK_NONCQ }, /* NCQ is broken */ { "Maxtor *", "BANC*", ATA_QUIRK_NONCQ }, { "Maxtor 7V300F0", "VA111630", ATA_QUIRK_NONCQ }, { "ST380817AS", "3.42", ATA_QUIRK_NONCQ }, { "ST3160023AS", "3.42", ATA_QUIRK_NONCQ }, { "OCZ CORE_SSD", "02.10104", ATA_QUIRK_NONCQ }, /* Seagate NCQ + FLUSH CACHE firmware bug */ { "ST31500341AS", "SD1[5-9]", ATA_QUIRK_NONCQ | ATA_QUIRK_FIRMWARE_WARN }, { "ST31000333AS", "SD1[5-9]", ATA_QUIRK_NONCQ | ATA_QUIRK_FIRMWARE_WARN }, { "ST3640[36]23AS", "SD1[5-9]", ATA_QUIRK_NONCQ | ATA_QUIRK_FIRMWARE_WARN }, { "ST3320[68]13AS", "SD1[5-9]", ATA_QUIRK_NONCQ | ATA_QUIRK_FIRMWARE_WARN }, /* drives which fail FPDMA_AA activation (some may freeze afterwards) the ST disks also have LPM issues */ { "ST1000LM024 HN-M101MBB", NULL, ATA_QUIRK_BROKEN_FPDMA_AA | ATA_QUIRK_NOLPM }, { "VB0250EAVER", "HPG7", ATA_QUIRK_BROKEN_FPDMA_AA }, /* Blacklist entries taken from Silicon Image 3124/3132 Windows driver .inf file - also several Linux problem reports */ { "HTS541060G9SA00", "MB3OC60D", ATA_QUIRK_NONCQ }, { "HTS541080G9SA00", "MB4OC60D", ATA_QUIRK_NONCQ }, { "HTS541010G9SA00", "MBZOC60D", ATA_QUIRK_NONCQ }, /* https://bugzilla.kernel.org/show_bug.cgi?id=15573 */ { "C300-CTFDDAC128MAG", "0001", ATA_QUIRK_NONCQ }, /* Sandisk SD7/8/9s lock up hard on large trims */ { "SanDisk SD[789]*", NULL, ATA_QUIRK_MAX_TRIM_128M }, /* devices which puke on READ_NATIVE_MAX */ { "HDS724040KLSA80", "KFAOA20N", ATA_QUIRK_BROKEN_HPA }, { "WDC WD3200JD-00KLB0", "WD-WCAMR1130137", ATA_QUIRK_BROKEN_HPA }, { "WDC WD2500JD-00HBB0", "WD-WMAL71490727", ATA_QUIRK_BROKEN_HPA }, { "MAXTOR 6L080L4", "A93.0500", ATA_QUIRK_BROKEN_HPA }, /* this one allows HPA unlocking but fails IOs on the area */ { "OCZ-VERTEX", "1.30", ATA_QUIRK_BROKEN_HPA }, /* Devices which report 1 sector over size HPA */ { "ST340823A", NULL, ATA_QUIRK_HPA_SIZE }, { "ST320413A", NULL, ATA_QUIRK_HPA_SIZE }, { "ST310211A", NULL, ATA_QUIRK_HPA_SIZE }, /* Devices which get the IVB wrong */ { "QUANTUM FIREBALLlct10 05", "A03.0900", ATA_QUIRK_IVB }, /* Maybe we should just add all TSSTcorp devices... */ { "TSSTcorp CDDVDW SH-S202[HJN]", "SB0[01]", ATA_QUIRK_IVB }, /* Devices that do not need bridging limits applied */ { "MTRON MSP-SATA*", NULL, ATA_QUIRK_BRIDGE_OK }, { "BUFFALO HD-QSU2/R5", NULL, ATA_QUIRK_BRIDGE_OK }, /* Devices which aren't very happy with higher link speeds */ { "WD My Book", NULL, ATA_QUIRK_1_5_GBPS }, { "Seagate FreeAgent GoFlex", NULL, ATA_QUIRK_1_5_GBPS }, /* * Devices which choke on SETXFER. Applies only if both the * device and controller are SATA. */ { "PIONEER DVD-RW DVRTD08", NULL, ATA_QUIRK_NOSETXFER }, { "PIONEER DVD-RW DVRTD08A", NULL, ATA_QUIRK_NOSETXFER }, { "PIONEER DVD-RW DVR-215", NULL, ATA_QUIRK_NOSETXFER }, { "PIONEER DVD-RW DVR-212D", NULL, ATA_QUIRK_NOSETXFER }, { "PIONEER DVD-RW DVR-216D", NULL, ATA_QUIRK_NOSETXFER }, /* These specific Pioneer models have LPM issues */ { "PIONEER BD-RW BDR-207M", NULL, ATA_QUIRK_NOLPM }, { "PIONEER BD-RW BDR-205", NULL, ATA_QUIRK_NOLPM }, /* Crucial devices with broken LPM support */ { "CT*0BX*00SSD1", NULL, ATA_QUIRK_NOLPM }, /* 512GB MX100 with MU01 firmware has both queued TRIM and LPM issues */ { "Crucial_CT512MX100*", "MU01", ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NOLPM }, /* 512GB MX100 with newer firmware has only LPM issues */ { "Crucial_CT512MX100*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NOLPM }, /* 480GB+ M500 SSDs have both queued TRIM and LPM issues */ { "Crucial_CT480M500*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NOLPM }, { "Crucial_CT960M500*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NOLPM }, /* AMD Radeon devices with broken LPM support */ { "R3SL240G", NULL, ATA_QUIRK_NOLPM }, /* Apacer models with LPM issues */ { "Apacer AS340*", NULL, ATA_QUIRK_NOLPM }, /* These specific Samsung models/firmware-revs do not handle LPM well */ { "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_QUIRK_NOLPM }, { "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_QUIRK_NOLPM }, { "SAMSUNG MZ7TD256HAFV-000L9", NULL, ATA_QUIRK_NOLPM }, { "SAMSUNG MZ7TE512HMHP-000L1", "EXT06L0Q", ATA_QUIRK_NOLPM }, /* devices that don't properly handle queued TRIM commands */ { "Micron_M500IT_*", "MU01", ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Micron_M500_*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Micron_M5[15]0_*", "MU01", ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Micron_1100_*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM, }, { "Crucial_CT*M500*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Crucial_CT*M550*", "MU01", ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Crucial_CT*MX100*", "MU01", ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Samsung SSD 840 EVO*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_NO_DMA_LOG | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Samsung SSD 840*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Samsung SSD 850*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, { "Samsung SSD 860*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI | ATA_QUIRK_NO_LPM_ON_ATI }, { "Samsung SSD 870*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI | ATA_QUIRK_NO_LPM_ON_ATI }, { "SAMSUNG*MZ7LH*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI | ATA_QUIRK_NO_LPM_ON_ATI }, { "FCCT*M500*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, /* devices that don't properly handle TRIM commands */ { "SuperSSpeed S238*", NULL, ATA_QUIRK_NOTRIM }, { "M88V29*", NULL, ATA_QUIRK_NOTRIM }, /* * As defined, the DRAT (Deterministic Read After Trim) and RZAT * (Return Zero After Trim) flags in the ATA Command Set are * unreliable in the sense that they only define what happens if * the device successfully executed the DSM TRIM command. TRIM * is only advisory, however, and the device is free to silently * ignore all or parts of the request. * * Whitelist drives that are known to reliably return zeroes * after TRIM. */ /* * The intel 510 drive has buggy DRAT/RZAT. Explicitly exclude * that model before whitelisting all other intel SSDs. */ { "INTEL*SSDSC2MH*", NULL, 0 }, { "Micron*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, { "Crucial*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, { "INTEL*SSD*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, { "SSD*INTEL*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, { "Samsung*SSD*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, { "SAMSUNG*SSD*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, { "SAMSUNG*MZ7KM*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, { "ST[1248][0248]0[FH]*", NULL, ATA_QUIRK_ZERO_AFTER_TRIM }, /* * Some WD SATA-I drives spin up and down erratically when the link * is put into the slumber mode. We don't have full list of the * affected devices. Disable LPM if the device matches one of the * known prefixes and is SATA-1. As a side effect LPM partial is * lost too. * * https://bugzilla.kernel.org/show_bug.cgi?id=57211 */ { "WDC WD800JD-*", NULL, ATA_QUIRK_WD_BROKEN_LPM }, { "WDC WD1200JD-*", NULL, ATA_QUIRK_WD_BROKEN_LPM }, { "WDC WD1600JD-*", NULL, ATA_QUIRK_WD_BROKEN_LPM }, { "WDC WD2000JD-*", NULL, ATA_QUIRK_WD_BROKEN_LPM }, { "WDC WD2500JD-*", NULL, ATA_QUIRK_WD_BROKEN_LPM }, { "WDC WD3000JD-*", NULL, ATA_QUIRK_WD_BROKEN_LPM }, { "WDC WD3200JD-*", NULL, ATA_QUIRK_WD_BROKEN_LPM }, /* * This sata dom device goes on a walkabout when the ATA_LOG_DIRECTORY * log page is accessed. Ensure we never ask for this log page with * these devices. */ { "SATADOM-ML 3ME", NULL, ATA_QUIRK_NO_LOG_DIR }, /* Buggy FUA */ { "Maxtor", "BANC1G10", ATA_QUIRK_NO_FUA }, { "WDC*WD2500J*", NULL, ATA_QUIRK_NO_FUA }, { "OCZ-VERTEX*", NULL, ATA_QUIRK_NO_FUA }, { "INTEL*SSDSC2CT*", NULL, ATA_QUIRK_NO_FUA }, /* End Marker */ { } }; static unsigned int ata_dev_quirks(const struct ata_device *dev) { unsigned char model_num[ATA_ID_PROD_LEN + 1]; unsigned char model_rev[ATA_ID_FW_REV_LEN + 1]; const struct ata_dev_quirks_entry *ad = __ata_dev_quirks; /* dev->quirks is an unsigned int. */ BUILD_BUG_ON(__ATA_QUIRK_MAX > 32); ata_id_c_string(dev->id, model_num, ATA_ID_PROD, sizeof(model_num)); ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev)); while (ad->model_num) { if (glob_match(ad->model_num, model_num) && (!ad->model_rev || glob_match(ad->model_rev, model_rev))) { ata_dev_print_quirks(dev, model_num, model_rev, ad->quirks); return ad->quirks; } ad++; } return 0; } static bool ata_dev_nodma(const struct ata_device *dev) { /* * We do not support polling DMA. Deny DMA for those ATAPI devices * with CDB-intr (and use PIO) if the LLDD handles only interrupts in * the HSM_ST_LAST state. */ if ((dev->link->ap->flags & ATA_FLAG_PIO_POLLING) && (dev->flags & ATA_DFLAG_CDB_INTR)) return true; return dev->quirks & ATA_QUIRK_NODMA; } /** * ata_is_40wire - check drive side detection * @dev: device * * Perform drive side detection decoding, allowing for device vendors * who can't follow the documentation. */ static int ata_is_40wire(struct ata_device *dev) { if (dev->quirks & ATA_QUIRK_IVB) return ata_drive_40wire_relaxed(dev->id); return ata_drive_40wire(dev->id); } /** * cable_is_40wire - 40/80/SATA decider * @ap: port to consider * * This function encapsulates the policy for speed management * in one place. At the moment we don't cache the result but * there is a good case for setting ap->cbl to the result when * we are called with unknown cables (and figuring out if it * impacts hotplug at all). * * Return 1 if the cable appears to be 40 wire. */ static int cable_is_40wire(struct ata_port *ap) { struct ata_link *link; struct ata_device *dev; /* If the controller thinks we are 40 wire, we are. */ if (ap->cbl == ATA_CBL_PATA40) return 1; /* If the controller thinks we are 80 wire, we are. */ if (ap->cbl == ATA_CBL_PATA80 || ap->cbl == ATA_CBL_SATA) return 0; /* If the system is known to be 40 wire short cable (eg * laptop), then we allow 80 wire modes even if the drive * isn't sure. */ if (ap->cbl == ATA_CBL_PATA40_SHORT) return 0; /* If the controller doesn't know, we scan. * * Note: We look for all 40 wire detects at this point. Any * 80 wire detect is taken to be 80 wire cable because * - in many setups only the one drive (slave if present) will * give a valid detect * - if you have a non detect capable drive you don't want it * to colour the choice */ ata_for_each_link(link, ap, EDGE) { ata_for_each_dev(dev, link, ENABLED) { if (!ata_is_40wire(dev)) return 0; } } return 1; } /** * ata_dev_xfermask - Compute supported xfermask of the given device * @dev: Device to compute xfermask for * * Compute supported xfermask of @dev and store it in * dev->*_mask. This function is responsible for applying all * known limits including host controller limits, device quirks, etc... * * LOCKING: * None. */ static void ata_dev_xfermask(struct ata_device *dev) { struct ata_link *link = dev->link; struct ata_port *ap = link->ap; struct ata_host *host = ap->host; unsigned int xfer_mask; /* controller modes available */ xfer_mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask, ap->udma_mask); /* drive modes available */ xfer_mask &= ata_pack_xfermask(dev->pio_mask, dev->mwdma_mask, dev->udma_mask); xfer_mask &= ata_id_xfermask(dev->id); /* * CFA Advanced TrueIDE timings are not allowed on a shared * cable */ if (ata_dev_pair(dev)) { /* No PIO5 or PIO6 */ xfer_mask &= ~(0x03 << (ATA_SHIFT_PIO + 5)); /* No MWDMA3 or MWDMA 4 */ xfer_mask &= ~(0x03 << (ATA_SHIFT_MWDMA + 3)); } if (ata_dev_nodma(dev)) { xfer_mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA); ata_dev_warn(dev, "device does not support DMA, disabling DMA\n"); } if ((host->flags & ATA_HOST_SIMPLEX) && host->simplex_claimed && host->simplex_claimed != ap) { xfer_mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA); ata_dev_warn(dev, "simplex DMA is claimed by other device, disabling DMA\n"); } if (ap->flags & ATA_FLAG_NO_IORDY) xfer_mask &= ata_pio_mask_no_iordy(dev); if (ap->ops->mode_filter) xfer_mask = ap->ops->mode_filter(dev, xfer_mask); /* Apply cable rule here. Don't apply it early because when * we handle hot plug the cable type can itself change. * Check this last so that we know if the transfer rate was * solely limited by the cable. * Unknown or 80 wire cables reported host side are checked * drive side as well. Cases where we know a 40wire cable * is used safely for 80 are not checked here. */ if (xfer_mask & (0xF8 << ATA_SHIFT_UDMA)) /* UDMA/44 or higher would be available */ if (cable_is_40wire(ap)) { ata_dev_warn(dev, "limited to UDMA/33 due to 40-wire cable\n"); xfer_mask &= ~(0xF8 << ATA_SHIFT_UDMA); } ata_unpack_xfermask(xfer_mask, &dev->pio_mask, &dev->mwdma_mask, &dev->udma_mask); } /** * ata_dev_set_xfermode - Issue SET FEATURES - XFER MODE command * @dev: Device to which command will be sent * * Issue SET FEATURES - XFER MODE command to device @dev * on port @ap. * * LOCKING: * PCI/etc. bus probe sem. * * RETURNS: * 0 on success, AC_ERR_* mask otherwise. */ static unsigned int ata_dev_set_xfermode(struct ata_device *dev) { struct ata_taskfile tf; /* set up set-features taskfile */ ata_dev_dbg(dev, "set features - xfer mode\n"); /* Some controllers and ATAPI devices show flaky interrupt * behavior after setting xfer mode. Use polling instead. */ ata_tf_init(dev, &tf); tf.command = ATA_CMD_SET_FEATURES; tf.feature = SETFEATURES_XFER; tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_POLLING; tf.protocol = ATA_PROT_NODATA; /* If we are using IORDY we must send the mode setting command */ if (ata_pio_need_iordy(dev)) tf.nsect = dev->xfer_mode; /* If the device has IORDY and the controller does not - turn it off */ else if (ata_id_has_iordy(dev->id)) tf.nsect = 0x01; else /* In the ancient relic department - skip all of this */ return 0; /* * On some disks, this command causes spin-up, so we need longer * timeout. */ return ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 15000); } /** * ata_dev_set_feature - Issue SET FEATURES * @dev: Device to which command will be sent * @subcmd: The SET FEATURES subcommand to be sent * @action: The sector count represents a subcommand specific action * * Issue SET FEATURES command to device @dev on port @ap with sector count * * LOCKING: * PCI/etc. bus probe sem. * * RETURNS: * 0 on success, AC_ERR_* mask otherwise. */ unsigned int ata_dev_set_feature(struct ata_device *dev, u8 subcmd, u8 action) { struct ata_taskfile tf; unsigned int timeout = 0; /* set up set-features taskfile */ ata_dev_dbg(dev, "set features\n"); ata_tf_init(dev, &tf); tf.command = ATA_CMD_SET_FEATURES; tf.feature = subcmd; tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE; tf.protocol = ATA_PROT_NODATA; tf.nsect = action; if (subcmd == SETFEATURES_SPINUP) timeout = ata_probe_timeout ? ata_probe_timeout * 1000 : SETFEATURES_SPINUP_TIMEOUT; return ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, timeout); } EXPORT_SYMBOL_GPL(ata_dev_set_feature); /** * ata_dev_init_params - Issue INIT DEV PARAMS command * @dev: Device to which command will be sent * @heads: Number of heads (taskfile parameter) * @sectors: Number of sectors (taskfile parameter) * * LOCKING: * Kernel thread context (may sleep) * * RETURNS: * 0 on success, AC_ERR_* mask otherwise. */ static unsigned int ata_dev_init_params(struct ata_device *dev, u16 heads, u16 sectors) { struct ata_taskfile tf; unsigned int err_mask; /* Number of sectors per track 1-255. Number of heads 1-16 */ if (sectors < 1 || sectors > 255 || heads < 1 || heads > 16) return AC_ERR_INVALID; /* set up init dev params taskfile */ ata_dev_dbg(dev, "init dev params\n"); ata_tf_init(dev, &tf); tf.command = ATA_CMD_INIT_DEV_PARAMS; tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE; tf.protocol = ATA_PROT_NODATA; tf.nsect = sectors; tf.device |= (heads - 1) & 0x0f; /* max head = num. of heads - 1 */ err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); /* A clean abort indicates an original or just out of spec drive and we should continue as we issue the setup based on the drive reported working geometry */ if (err_mask == AC_ERR_DEV && (tf.error & ATA_ABORTED)) err_mask = 0; return err_mask; } /** * atapi_check_dma - Check whether ATAPI DMA can be supported * @qc: Metadata associated with taskfile to check * * Allow low-level driver to filter ATA PACKET commands, returning * a status indicating whether or not it is OK to use DMA for the * supplied PACKET command. * * LOCKING: * spin_lock_irqsave(host lock) * * RETURNS: 0 when ATAPI DMA can be used * nonzero otherwise */ int atapi_check_dma(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; /* Don't allow DMA if it isn't multiple of 16 bytes. Quite a * few ATAPI devices choke on such DMA requests. */ if (!(qc->dev->quirks & ATA_QUIRK_ATAPI_MOD16_DMA) && unlikely(qc->nbytes & 15)) return -EOPNOTSUPP; if (ap->ops->check_atapi_dma) return ap->ops->check_atapi_dma(qc); return 0; } /** * ata_std_qc_defer - Check whether a qc needs to be deferred * @qc: ATA command in question * * Non-NCQ commands cannot run with any other command, NCQ or * not. As upper layer only knows the queue depth, we are * responsible for maintaining exclusion. This function checks * whether a new command @qc can be issued. * * LOCKING: * spin_lock_irqsave(host lock) * * RETURNS: * ATA_DEFER_* if deferring is needed, 0 otherwise. */ int ata_std_qc_defer(struct ata_queued_cmd *qc) { struct ata_link *link = qc->dev->link; if (ata_is_ncq(qc->tf.protocol)) { if (!ata_tag_valid(link->active_tag)) return 0; } else { if (!ata_tag_valid(link->active_tag) && !link->sactive) return 0; } return ATA_DEFER_LINK; } EXPORT_SYMBOL_GPL(ata_std_qc_defer); /** * ata_sg_init - Associate command with scatter-gather table. * @qc: Command to be associated * @sg: Scatter-gather table. * @n_elem: Number of elements in s/g table. * * Initialize the data-related elements of queued_cmd @qc * to point to a scatter-gather table @sg, containing @n_elem * elements. * * LOCKING: * spin_lock_irqsave(host lock) */ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg, unsigned int n_elem) { qc->sg = sg; qc->n_elem = n_elem; qc->cursg = qc->sg; } #ifdef CONFIG_HAS_DMA /** * ata_sg_clean - Unmap DMA memory associated with command * @qc: Command containing DMA memory to be released * * Unmap all mapped DMA memory associated with this command. * * LOCKING: * spin_lock_irqsave(host lock) */ static void ata_sg_clean(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; struct scatterlist *sg = qc->sg; int dir = qc->dma_dir; WARN_ON_ONCE(sg == NULL); if (qc->n_elem) dma_unmap_sg(ap->dev, sg, qc->orig_n_elem, dir); qc->flags &= ~ATA_QCFLAG_DMAMAP; qc->sg = NULL; } /** * ata_sg_setup - DMA-map the scatter-gather table associated with a command. * @qc: Command with scatter-gather table to be mapped. * * DMA-map the scatter-gather table associated with queued_cmd @qc. * * LOCKING: * spin_lock_irqsave(host lock) * * RETURNS: * Zero on success, negative on error. * */ static int ata_sg_setup(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; unsigned int n_elem; n_elem = dma_map_sg(ap->dev, qc->sg, qc->n_elem, qc->dma_dir); if (n_elem < 1) return -1; qc->orig_n_elem = qc->n_elem; qc->n_elem = n_elem; qc->flags |= ATA_QCFLAG_DMAMAP; return 0; } #else /* !CONFIG_HAS_DMA */ static inline void ata_sg_clean(struct ata_queued_cmd *qc) {} static inline int ata_sg_setup(struct ata_queued_cmd *qc) { return -1; } #endif /* !CONFIG_HAS_DMA */ /** * swap_buf_le16 - swap halves of 16-bit words in place * @buf: Buffer to swap * @buf_words: Number of 16-bit words in buffer. * * Swap halves of 16-bit words if needed to convert from * little-endian byte order to native cpu byte order, or * vice-versa. * * LOCKING: * Inherited from caller. */ void swap_buf_le16(u16 *buf, unsigned int buf_words) { #ifdef __BIG_ENDIAN unsigned int i; for (i = 0; i < buf_words; i++) buf[i] = le16_to_cpu(buf[i]); #endif /* __BIG_ENDIAN */ } /** * ata_qc_free - free unused ata_queued_cmd * @qc: Command to complete * * Designed to free unused ata_queued_cmd object * in case something prevents using it. * * LOCKING: * spin_lock_irqsave(host lock) */ void ata_qc_free(struct ata_queued_cmd *qc) { qc->flags = 0; if (ata_tag_valid(qc->tag)) qc->tag = ATA_TAG_POISON; } void __ata_qc_complete(struct ata_queued_cmd *qc) { struct ata_port *ap; struct ata_link *link; if (WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE))) return; ap = qc->ap; link = qc->dev->link; if (likely(qc->flags & ATA_QCFLAG_DMAMAP)) ata_sg_clean(qc); /* command should be marked inactive atomically with qc completion */ if (ata_is_ncq(qc->tf.protocol)) { link->sactive &= ~(1 << qc->hw_tag); if (!link->sactive) ap->nr_active_links--; } else { link->active_tag = ATA_TAG_POISON; ap->nr_active_links--; } /* clear exclusive status */ if (unlikely(qc->flags & ATA_QCFLAG_CLEAR_EXCL && ap->excl_link == link)) ap->excl_link = NULL; /* * Mark qc as inactive to prevent the port interrupt handler from * completing the command twice later, before the error handler is * called. */ qc->flags &= ~ATA_QCFLAG_ACTIVE; ap->qc_active &= ~(1ULL << qc->tag); /* call completion callback */ qc->complete_fn(qc); } static void fill_result_tf(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; /* * rtf may already be filled (e.g. for successful NCQ commands). * If that is the case, we have nothing to do. */ if (qc->flags & ATA_QCFLAG_RTF_FILLED) return; qc->result_tf.flags = qc->tf.flags; ap->ops->qc_fill_rtf(qc); qc->flags |= ATA_QCFLAG_RTF_FILLED; } static void ata_verify_xfer(struct ata_queued_cmd *qc) { struct ata_device *dev = qc->dev; if (!ata_is_data(qc->tf.protocol)) return; if ((dev->mwdma_mask || dev->udma_mask) && ata_is_pio(qc->tf.protocol)) return; dev->flags &= ~ATA_DFLAG_DUBIOUS_XFER; } /** * ata_qc_complete - Complete an active ATA command * @qc: Command to complete * * Indicate to the mid and upper layers that an ATA command has * completed, with either an ok or not-ok status. * * Refrain from calling this function multiple times when * successfully completing multiple NCQ commands. * ata_qc_complete_multiple() should be used instead, which will * properly update IRQ expect state. * * LOCKING: * spin_lock_irqsave(host lock) */ void ata_qc_complete(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; struct ata_device *dev = qc->dev; struct ata_eh_info *ehi = &dev->link->eh_info; /* Trigger the LED (if available) */ ledtrig_disk_activity(!!(qc->tf.flags & ATA_TFLAG_WRITE)); /* * In order to synchronize EH with the regular execution path, a qc that * is owned by EH is marked with ATA_QCFLAG_EH. * * The normal execution path is responsible for not accessing a qc owned * by EH. libata core enforces the rule by returning NULL from * ata_qc_from_tag() for qcs owned by EH. */ if (unlikely(qc->err_mask)) qc->flags |= ATA_QCFLAG_EH; /* * Finish internal commands without any further processing and always * with the result TF filled. */ if (unlikely(ata_tag_internal(qc->tag))) { fill_result_tf(qc); trace_ata_qc_complete_internal(qc); __ata_qc_complete(qc); return; } /* Non-internal qc has failed. Fill the result TF and summon EH. */ if (unlikely(qc->flags & ATA_QCFLAG_EH)) { fill_result_tf(qc); trace_ata_qc_complete_failed(qc); ata_qc_schedule_eh(qc); return; } WARN_ON_ONCE(ata_port_is_frozen(ap)); /* read result TF if requested */ if (qc->flags & ATA_QCFLAG_RESULT_TF) fill_result_tf(qc); trace_ata_qc_complete_done(qc); /* * For CDL commands that completed without an error, check if we have * sense data (ATA_SENSE is set). If we do, then the command may have * been aborted by the device due to a limit timeout using the policy * 0xD. For these commands, invoke EH to get the command sense data. */ if (qc->flags & ATA_QCFLAG_HAS_CDL && qc->result_tf.status & ATA_SENSE) { /* * Tell SCSI EH to not overwrite scmd->result even if this * command is finished with result SAM_STAT_GOOD. */ qc->scsicmd->flags |= SCMD_FORCE_EH_SUCCESS; qc->flags |= ATA_QCFLAG_EH_SUCCESS_CMD; ehi->dev_action[dev->devno] |= ATA_EH_GET_SUCCESS_SENSE; /* * set pending so that ata_qc_schedule_eh() does not trigger * fast drain, and freeze the port. */ ap->pflags |= ATA_PFLAG_EH_PENDING; ata_qc_schedule_eh(qc); return; } /* Some commands need post-processing after successful completion. */ switch (qc->tf.command) { case ATA_CMD_SET_FEATURES: if (qc->tf.feature != SETFEATURES_WC_ON && qc->tf.feature != SETFEATURES_WC_OFF && qc->tf.feature != SETFEATURES_RA_ON && qc->tf.feature != SETFEATURES_RA_OFF) break; fallthrough; case ATA_CMD_INIT_DEV_PARAMS: /* CHS translation changed */ case ATA_CMD_SET_MULTI: /* multi_count changed */ /* revalidate device */ ehi->dev_action[dev->devno] |= ATA_EH_REVALIDATE; ata_port_schedule_eh(ap); break; case ATA_CMD_SLEEP: dev->flags |= ATA_DFLAG_SLEEPING; break; } if (unlikely(dev->flags & ATA_DFLAG_DUBIOUS_XFER)) ata_verify_xfer(qc); __ata_qc_complete(qc); } EXPORT_SYMBOL_GPL(ata_qc_complete); /** * ata_qc_get_active - get bitmask of active qcs * @ap: port in question * * LOCKING: * spin_lock_irqsave(host lock) * * RETURNS: * Bitmask of active qcs */ u64 ata_qc_get_active(struct ata_port *ap) { u64 qc_active = ap->qc_active; /* ATA_TAG_INTERNAL is sent to hw as tag 0 */ if (qc_active & (1ULL << ATA_TAG_INTERNAL)) { qc_active |= (1 << 0); qc_active &= ~(1ULL << ATA_TAG_INTERNAL); } return qc_active; } EXPORT_SYMBOL_GPL(ata_qc_get_active); /** * ata_qc_issue - issue taskfile to device * @qc: command to issue to device * * Prepare an ATA command to submission to device. * This includes mapping the data into a DMA-able * area, filling in the S/G table, and finally * writing the taskfile to hardware, starting the command. * * LOCKING: * spin_lock_irqsave(host lock) */ void ata_qc_issue(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; struct ata_link *link = qc->dev->link; u8 prot = qc->tf.protocol; /* Make sure only one non-NCQ command is outstanding. */ WARN_ON_ONCE(ata_tag_valid(link->active_tag)); if (ata_is_ncq(prot)) { WARN_ON_ONCE(link->sactive & (1 << qc->hw_tag)); if (!link->sactive) ap->nr_active_links++; link->sactive |= 1 << qc->hw_tag; } else { WARN_ON_ONCE(link->sactive); ap->nr_active_links++; link->active_tag = qc->tag; } qc->flags |= ATA_QCFLAG_ACTIVE; ap->qc_active |= 1ULL << qc->tag; /* * We guarantee to LLDs that they will have at least one * non-zero sg if the command is a data command. */ if (ata_is_data(prot) && (!qc->sg || !qc->n_elem || !qc->nbytes)) goto sys_err; if (ata_is_dma(prot) || (ata_is_pio(prot) && (ap->flags & ATA_FLAG_PIO_DMA))) if (ata_sg_setup(qc)) goto sys_err; /* if device is sleeping, schedule reset and abort the link */ if (unlikely(qc->dev->flags & ATA_DFLAG_SLEEPING)) { link->eh_info.action |= ATA_EH_RESET; ata_ehi_push_desc(&link->eh_info, "waking up from sleep"); ata_link_abort(link); return; } if (ap->ops->qc_prep) { trace_ata_qc_prep(qc); qc->err_mask |= ap->ops->qc_prep(qc); if (unlikely(qc->err_mask)) goto err; } trace_ata_qc_issue(qc); qc->err_mask |= ap->ops->qc_issue(qc); if (unlikely(qc->err_mask)) goto err; return; sys_err: qc->err_mask |= AC_ERR_SYSTEM; err: ata_qc_complete(qc); } /** * ata_phys_link_online - test whether the given link is online * @link: ATA link to test * * Test whether @link is online. Note that this function returns * 0 if online status of @link cannot be obtained, so * ata_link_online(link) != !ata_link_offline(link). * * LOCKING: * None. * * RETURNS: * True if the port online status is available and online. */ bool ata_phys_link_online(struct ata_link *link) { u32 sstatus; if (sata_scr_read(link, SCR_STATUS, &sstatus) == 0 && ata_sstatus_online(sstatus)) return true; return false; } /** * ata_phys_link_offline - test whether the given link is offline * @link: ATA link to test * * Test whether @link is offline. Note that this function * returns 0 if offline status of @link cannot be obtained, so * ata_link_online(link) != !ata_link_offline(link). * * LOCKING: * None. * * RETURNS: * True if the port offline status is available and offline. */ bool ata_phys_link_offline(struct ata_link *link) { u32 sstatus; if (sata_scr_read(link, SCR_STATUS, &sstatus) == 0 && !ata_sstatus_online(sstatus)) return true; return false; } /** * ata_link_online - test whether the given link is online * @link: ATA link to test * * Test whether @link is online. This is identical to * ata_phys_link_online() when there's no slave link. When * there's a slave link, this function should only be called on * the master link and will return true if any of M/S links is * online. * * LOCKING: * None. * * RETURNS: * True if the port online status is available and online. */ bool ata_link_online(struct ata_link *link) { struct ata_link *slave = link->ap->slave_link; WARN_ON(link == slave); /* shouldn't be called on slave link */ return ata_phys_link_online(link) || (slave && ata_phys_link_online(slave)); } EXPORT_SYMBOL_GPL(ata_link_online); /** * ata_link_offline - test whether the given link is offline * @link: ATA link to test * * Test whether @link is offline. This is identical to * ata_phys_link_offline() when there's no slave link. When * there's a slave link, this function should only be called on * the master link and will return true if both M/S links are * offline. * * LOCKING: * None. * * RETURNS: * True if the port offline status is available and offline. */ bool ata_link_offline(struct ata_link *link) { struct ata_link *slave = link->ap->slave_link; WARN_ON(link == slave); /* shouldn't be called on slave link */ return ata_phys_link_offline(link) && (!slave || ata_phys_link_offline(slave)); } EXPORT_SYMBOL_GPL(ata_link_offline); #ifdef CONFIG_PM static void ata_port_request_pm(struct ata_port *ap, pm_message_t mesg, unsigned int action, unsigned int ehi_flags, bool async) { struct ata_link *link; unsigned long flags; spin_lock_irqsave(ap->lock, flags); /* * A previous PM operation might still be in progress. Wait for * ATA_PFLAG_PM_PENDING to clear. */ if (ap->pflags & ATA_PFLAG_PM_PENDING) { spin_unlock_irqrestore(ap->lock, flags); ata_port_wait_eh(ap); spin_lock_irqsave(ap->lock, flags); } /* Request PM operation to EH */ ap->pm_mesg = mesg; ap->pflags |= ATA_PFLAG_PM_PENDING; ata_for_each_link(link, ap, HOST_FIRST) { link->eh_info.action |= action; link->eh_info.flags |= ehi_flags; } ata_port_schedule_eh(ap); spin_unlock_irqrestore(ap->lock, flags); if (!async) ata_port_wait_eh(ap); } static void ata_port_suspend(struct ata_port *ap, pm_message_t mesg, bool async) { /* * We are about to suspend the port, so we do not care about * scsi_rescan_device() calls scheduled by previous resume operations. * The next resume will schedule the rescan again. So cancel any rescan * that is not done yet. */ cancel_delayed_work_sync(&ap->scsi_rescan_task); /* * On some hardware, device fails to respond after spun down for * suspend. As the device will not be used until being resumed, we * do not need to touch the device. Ask EH to skip the usual stuff * and proceed directly to suspend. * * http://thread.gmane.org/gmane.linux.ide/46764 */ ata_port_request_pm(ap, mesg, 0, ATA_EHI_QUIET | ATA_EHI_NO_AUTOPSY | ATA_EHI_NO_RECOVERY, async); } static int ata_port_pm_suspend(struct device *dev) { struct ata_port *ap = to_ata_port(dev); if (pm_runtime_suspended(dev)) return 0; ata_port_suspend(ap, PMSG_SUSPEND, false); return 0; } static int ata_port_pm_freeze(struct device *dev) { struct ata_port *ap = to_ata_port(dev); if (pm_runtime_suspended(dev)) return 0; ata_port_suspend(ap, PMSG_FREEZE, false); return 0; } static int ata_port_pm_poweroff(struct device *dev) { if (!pm_runtime_suspended(dev)) ata_port_suspend(to_ata_port(dev), PMSG_HIBERNATE, false); return 0; } static void ata_port_resume(struct ata_port *ap, pm_message_t mesg, bool async) { ata_port_request_pm(ap, mesg, ATA_EH_RESET, ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET, async); } static int ata_port_pm_resume(struct device *dev) { if (!pm_runtime_suspended(dev)) ata_port_resume(to_ata_port(dev), PMSG_RESUME, true); return 0; } /* * For ODDs, the upper layer will poll for media change every few seconds, * which will make it enter and leave suspend state every few seconds. And * as each suspend will cause a hard/soft reset, the gain of runtime suspend * is very little and the ODD may malfunction after constantly being reset. * So the idle callback here will not proceed to suspend if a non-ZPODD capable * ODD is attached to the port. */ static int ata_port_runtime_idle(struct device *dev) { struct ata_port *ap = to_ata_port(dev); struct ata_link *link; struct ata_device *adev; ata_for_each_link(link, ap, HOST_FIRST) { ata_for_each_dev(adev, link, ENABLED) if (adev->class == ATA_DEV_ATAPI && !zpodd_dev_enabled(adev)) return -EBUSY; } return 0; } static int ata_port_runtime_suspend(struct device *dev) { ata_port_suspend(to_ata_port(dev), PMSG_AUTO_SUSPEND, false); return 0; } static int ata_port_runtime_resume(struct device *dev) { ata_port_resume(to_ata_port(dev), PMSG_AUTO_RESUME, false); return 0; } static const struct dev_pm_ops ata_port_pm_ops = { .suspend = ata_port_pm_suspend, .resume = ata_port_pm_resume, .freeze = ata_port_pm_freeze, .thaw = ata_port_pm_resume, .poweroff = ata_port_pm_poweroff, .restore = ata_port_pm_resume, .runtime_suspend = ata_port_runtime_suspend, .runtime_resume = ata_port_runtime_resume, .runtime_idle = ata_port_runtime_idle, }; /* sas ports don't participate in pm runtime management of ata_ports, * and need to resume ata devices at the domain level, not the per-port * level. sas suspend/resume is async to allow parallel port recovery * since sas has multiple ata_port instances per Scsi_Host. */ void ata_sas_port_suspend(struct ata_port *ap) { ata_port_suspend(ap, PMSG_SUSPEND, true); } EXPORT_SYMBOL_GPL(ata_sas_port_suspend); void ata_sas_port_resume(struct ata_port *ap) { ata_port_resume(ap, PMSG_RESUME, true); } EXPORT_SYMBOL_GPL(ata_sas_port_resume); /** * ata_host_suspend - suspend host * @host: host to suspend * @mesg: PM message * * Suspend @host. Actual operation is performed by port suspend. */ void ata_host_suspend(struct ata_host *host, pm_message_t mesg) { host->dev->power.power_state = mesg; } EXPORT_SYMBOL_GPL(ata_host_suspend); /** * ata_host_resume - resume host * @host: host to resume * * Resume @host. Actual operation is performed by port resume. */ void ata_host_resume(struct ata_host *host) { host->dev->power.power_state = PMSG_ON; } EXPORT_SYMBOL_GPL(ata_host_resume); #endif const struct device_type ata_port_type = { .name = ATA_PORT_TYPE_NAME, #ifdef CONFIG_PM .pm = &ata_port_pm_ops, #endif }; /** * ata_dev_init - Initialize an ata_device structure * @dev: Device structure to initialize * * Initialize @dev in preparation for probing. * * LOCKING: * Inherited from caller. */ void ata_dev_init(struct ata_device *dev) { struct ata_link *link = ata_dev_phys_link(dev); struct ata_port *ap = link->ap; unsigned long flags; /* SATA spd limit is bound to the attached device, reset together */ link->sata_spd_limit = link->hw_sata_spd_limit; link->sata_spd = 0; /* High bits of dev->flags are used to record warm plug * requests which occur asynchronously. Synchronize using * host lock. */ spin_lock_irqsave(ap->lock, flags); dev->flags &= ~ATA_DFLAG_INIT_MASK; dev->quirks = 0; spin_unlock_irqrestore(ap->lock, flags); memset((void *)dev + ATA_DEVICE_CLEAR_BEGIN, 0, ATA_DEVICE_CLEAR_END - ATA_DEVICE_CLEAR_BEGIN); dev->pio_mask = UINT_MAX; dev->mwdma_mask = UINT_MAX; dev->udma_mask = UINT_MAX; } /** * ata_link_init - Initialize an ata_link structure * @ap: ATA port link is attached to * @link: Link structure to initialize * @pmp: Port multiplier port number * * Initialize @link. * * LOCKING: * Kernel thread context (may sleep) */ void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp) { int i; /* clear everything except for devices */ memset((void *)link + ATA_LINK_CLEAR_BEGIN, 0, ATA_LINK_CLEAR_END - ATA_LINK_CLEAR_BEGIN); link->ap = ap; link->pmp = pmp; link->active_tag = ATA_TAG_POISON; link->hw_sata_spd_limit = UINT_MAX; /* can't use iterator, ap isn't initialized yet */ for (i = 0; i < ATA_MAX_DEVICES; i++) { struct ata_device *dev = &link->device[i]; dev->link = link; dev->devno = dev - link->device; #ifdef CONFIG_ATA_ACPI dev->gtf_filter = ata_acpi_gtf_filter; #endif ata_dev_init(dev); } } /** * sata_link_init_spd - Initialize link->sata_spd_limit * @link: Link to configure sata_spd_limit for * * Initialize ``link->[hw_]sata_spd_limit`` to the currently * configured value. * * LOCKING: * Kernel thread context (may sleep). * * RETURNS: * 0 on success, -errno on failure. */ int sata_link_init_spd(struct ata_link *link) { u8 spd; int rc; rc = sata_scr_read(link, SCR_CONTROL, &link->saved_scontrol); if (rc) return rc; spd = (link->saved_scontrol >> 4) & 0xf; if (spd) link->hw_sata_spd_limit &= (1 << spd) - 1; ata_force_link_limits(link); link->sata_spd_limit = link->hw_sata_spd_limit; return 0; } /** * ata_port_alloc - allocate and initialize basic ATA port resources * @host: ATA host this allocated port belongs to * * Allocate and initialize basic ATA port resources. * * RETURNS: * Allocate ATA port on success, NULL on failure. * * LOCKING: * Inherited from calling layer (may sleep). */ struct ata_port *ata_port_alloc(struct ata_host *host) { struct ata_port *ap; int id; ap = kzalloc(sizeof(*ap), GFP_KERNEL); if (!ap) return NULL; ap->pflags |= ATA_PFLAG_INITIALIZING | ATA_PFLAG_FROZEN; ap->lock = &host->lock; id = ida_alloc_min(&ata_ida, 1, GFP_KERNEL); if (id < 0) { kfree(ap); return NULL; } ap->print_id = id; ap->host = host; ap->dev = host->dev; mutex_init(&ap->scsi_scan_mutex); INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug); INIT_DELAYED_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan); INIT_LIST_HEAD(&ap->eh_done_q); init_waitqueue_head(&ap->eh_wait_q); init_completion(&ap->park_req_pending); timer_setup(&ap->fastdrain_timer, ata_eh_fastdrain_timerfn, TIMER_DEFERRABLE); ap->cbl = ATA_CBL_NONE; ata_link_init(ap, &ap->link, 0); #ifdef ATA_IRQ_TRAP ap->stats.unhandled_irq = 1; ap->stats.idle_irq = 1; #endif ata_sff_port_init(ap); ata_force_pflags(ap); return ap; } EXPORT_SYMBOL_GPL(ata_port_alloc); void ata_port_free(struct ata_port *ap) { if (!ap) return; kfree(ap->pmp_link); kfree(ap->slave_link); ida_free(&ata_ida, ap->print_id); kfree(ap); } EXPORT_SYMBOL_GPL(ata_port_free); static void ata_devres_release(struct device *gendev, void *res) { struct ata_host *host = dev_get_drvdata(gendev); int i; for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; if (!ap) continue; if (ap->scsi_host) scsi_host_put(ap->scsi_host); } dev_set_drvdata(gendev, NULL); ata_host_put(host); } static void ata_host_release(struct kref *kref) { struct ata_host *host = container_of(kref, struct ata_host, kref); int i; for (i = 0; i < host->n_ports; i++) { ata_port_free(host->ports[i]); host->ports[i] = NULL; } kfree(host); } void ata_host_get(struct ata_host *host) { kref_get(&host->kref); } void ata_host_put(struct ata_host *host) { kref_put(&host->kref, ata_host_release); } EXPORT_SYMBOL_GPL(ata_host_put); /** * ata_host_alloc - allocate and init basic ATA host resources * @dev: generic device this host is associated with * @n_ports: the number of ATA ports associated with this host * * Allocate and initialize basic ATA host resources. LLD calls * this function to allocate a host, initializes it fully and * attaches it using ata_host_register(). * * RETURNS: * Allocate ATA host on success, NULL on failure. * * LOCKING: * Inherited from calling layer (may sleep). */ struct ata_host *ata_host_alloc(struct device *dev, int n_ports) { struct ata_host *host; size_t sz; int i; void *dr; /* alloc a container for our list of ATA ports (buses) */ sz = sizeof(struct ata_host) + n_ports * sizeof(void *); host = kzalloc(sz, GFP_KERNEL); if (!host) return NULL; if (!devres_open_group(dev, NULL, GFP_KERNEL)) { kfree(host); return NULL; } dr = devres_alloc(ata_devres_release, 0, GFP_KERNEL); if (!dr) { kfree(host); goto err_out; } devres_add(dev, dr); dev_set_drvdata(dev, host); spin_lock_init(&host->lock); mutex_init(&host->eh_mutex); host->dev = dev; host->n_ports = n_ports; kref_init(&host->kref); /* allocate ports bound to this host */ for (i = 0; i < n_ports; i++) { struct ata_port *ap; ap = ata_port_alloc(host); if (!ap) goto err_out; ap->port_no = i; host->ports[i] = ap; } devres_remove_group(dev, NULL); return host; err_out: devres_release_group(dev, NULL); return NULL; } EXPORT_SYMBOL_GPL(ata_host_alloc); /** * ata_host_alloc_pinfo - alloc host and init with port_info array * @dev: generic device this host is associated with * @ppi: array of ATA port_info to initialize host with * @n_ports: number of ATA ports attached to this host * * Allocate ATA host and initialize with info from @ppi. If NULL * terminated, @ppi may contain fewer entries than @n_ports. The * last entry will be used for the remaining ports. * * RETURNS: * Allocate ATA host on success, NULL on failure. * * LOCKING: * Inherited from calling layer (may sleep). */ struct ata_host *ata_host_alloc_pinfo(struct device *dev, const struct ata_port_info * const * ppi, int n_ports) { const struct ata_port_info *pi = &ata_dummy_port_info; struct ata_host *host; int i, j; host = ata_host_alloc(dev, n_ports); if (!host) return NULL; for (i = 0, j = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; if (ppi[j]) pi = ppi[j++]; ap->pio_mask = pi->pio_mask; ap->mwdma_mask = pi->mwdma_mask; ap->udma_mask = pi->udma_mask; ap->flags |= pi->flags; ap->link.flags |= pi->link_flags; ap->ops = pi->port_ops; if (!host->ops && (pi->port_ops != &ata_dummy_port_ops)) host->ops = pi->port_ops; } return host; } EXPORT_SYMBOL_GPL(ata_host_alloc_pinfo); static void ata_host_stop(struct device *gendev, void *res) { struct ata_host *host = dev_get_drvdata(gendev); int i; WARN_ON(!(host->flags & ATA_HOST_STARTED)); for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; if (ap->ops->port_stop) ap->ops->port_stop(ap); } if (host->ops->host_stop) host->ops->host_stop(host); } /** * ata_finalize_port_ops - finalize ata_port_operations * @ops: ata_port_operations to finalize * * An ata_port_operations can inherit from another ops and that * ops can again inherit from another. This can go on as many * times as necessary as long as there is no loop in the * inheritance chain. * * Ops tables are finalized when the host is started. NULL or * unspecified entries are inherited from the closet ancestor * which has the method and the entry is populated with it. * After finalization, the ops table directly points to all the * methods and ->inherits is no longer necessary and cleared. * * Using ATA_OP_NULL, inheriting ops can force a method to NULL. * * LOCKING: * None. */ static void ata_finalize_port_ops(struct ata_port_operations *ops) { static DEFINE_SPINLOCK(lock); const struct ata_port_operations *cur; void **begin = (void **)ops; void **end = (void **)&ops->inherits; void **pp; if (!ops || !ops->inherits) return; spin_lock(&lock); for (cur = ops->inherits; cur; cur = cur->inherits) { void **inherit = (void **)cur; for (pp = begin; pp < end; pp++, inherit++) if (!*pp) *pp = *inherit; } for (pp = begin; pp < end; pp++) if (IS_ERR(*pp)) *pp = NULL; ops->inherits = NULL; spin_unlock(&lock); } /** * ata_host_start - start and freeze ports of an ATA host * @host: ATA host to start ports for * * Start and then freeze ports of @host. Started status is * recorded in host->flags, so this function can be called * multiple times. Ports are guaranteed to get started only * once. If host->ops is not initialized yet, it is set to the * first non-dummy port ops. * * LOCKING: * Inherited from calling layer (may sleep). * * RETURNS: * 0 if all ports are started successfully, -errno otherwise. */ int ata_host_start(struct ata_host *host) { int have_stop = 0; void *start_dr = NULL; int i, rc; if (host->flags & ATA_HOST_STARTED) return 0; ata_finalize_port_ops(host->ops); for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; ata_finalize_port_ops(ap->ops); if (!host->ops && !ata_port_is_dummy(ap)) host->ops = ap->ops; if (ap->ops->port_stop) have_stop = 1; } if (host->ops && host->ops->host_stop) have_stop = 1; if (have_stop) { start_dr = devres_alloc(ata_host_stop, 0, GFP_KERNEL); if (!start_dr) return -ENOMEM; } for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; if (ap->ops->port_start) { rc = ap->ops->port_start(ap); if (rc) { if (rc != -ENODEV) dev_err(host->dev, "failed to start port %d (errno=%d)\n", i, rc); goto err_out; } } ata_eh_freeze_port(ap); } if (start_dr) devres_add(host->dev, start_dr); host->flags |= ATA_HOST_STARTED; return 0; err_out: while (--i >= 0) { struct ata_port *ap = host->ports[i]; if (ap->ops->port_stop) ap->ops->port_stop(ap); } devres_free(start_dr); return rc; } EXPORT_SYMBOL_GPL(ata_host_start); /** * ata_host_init - Initialize a host struct for sas (ipr, libsas) * @host: host to initialize * @dev: device host is attached to * @ops: port_ops * */ void ata_host_init(struct ata_host *host, struct device *dev, struct ata_port_operations *ops) { spin_lock_init(&host->lock); mutex_init(&host->eh_mutex); host->n_tags = ATA_MAX_QUEUE; host->dev = dev; host->ops = ops; kref_init(&host->kref); } EXPORT_SYMBOL_GPL(ata_host_init); void ata_port_probe(struct ata_port *ap) { struct ata_eh_info *ehi = &ap->link.eh_info; unsigned long flags; /* kick EH for boot probing */ spin_lock_irqsave(ap->lock, flags); ehi->probe_mask |= ATA_ALL_DEVICES; ehi->action |= ATA_EH_RESET; ehi->flags |= ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET; ap->pflags &= ~ATA_PFLAG_INITIALIZING; ap->pflags |= ATA_PFLAG_LOADING; ata_port_schedule_eh(ap); spin_unlock_irqrestore(ap->lock, flags); } EXPORT_SYMBOL_GPL(ata_port_probe); static void async_port_probe(void *data, async_cookie_t cookie) { struct ata_port *ap = data; /* * If we're not allowed to scan this host in parallel, * we need to wait until all previous scans have completed * before going further. * Jeff Garzik says this is only within a controller, so we * don't need to wait for port 0, only for later ports. */ if (!(ap->host->flags & ATA_HOST_PARALLEL_SCAN) && ap->port_no != 0) async_synchronize_cookie(cookie); ata_port_probe(ap); ata_port_wait_eh(ap); /* in order to keep device order, we need to synchronize at this point */ async_synchronize_cookie(cookie); ata_scsi_scan_host(ap, 1); } /** * ata_host_register - register initialized ATA host * @host: ATA host to register * @sht: template for SCSI host * * Register initialized ATA host. @host is allocated using * ata_host_alloc() and fully initialized by LLD. This function * starts ports, registers @host with ATA and SCSI layers and * probe registered devices. * * LOCKING: * Inherited from calling layer (may sleep). * * RETURNS: * 0 on success, -errno otherwise. */ int ata_host_register(struct ata_host *host, const struct scsi_host_template *sht) { int i, rc; host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE); /* host must have been started */ if (!(host->flags & ATA_HOST_STARTED)) { dev_err(host->dev, "BUG: trying to register unstarted host\n"); WARN_ON(1); return -EINVAL; } /* Create associated sysfs transport objects */ for (i = 0; i < host->n_ports; i++) { rc = ata_tport_add(host->dev,host->ports[i]); if (rc) { goto err_tadd; } } rc = ata_scsi_add_hosts(host, sht); if (rc) goto err_tadd; /* set cable, sata_spd_limit and report */ for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; unsigned int xfer_mask; /* set SATA cable type if still unset */ if (ap->cbl == ATA_CBL_NONE && (ap->flags & ATA_FLAG_SATA)) ap->cbl = ATA_CBL_SATA; /* init sata_spd_limit to the current value */ sata_link_init_spd(&ap->link); if (ap->slave_link) sata_link_init_spd(ap->slave_link); /* print per-port info to dmesg */ xfer_mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask, ap->udma_mask); if (!ata_port_is_dummy(ap)) { ata_port_info(ap, "%cATA max %s %s\n", (ap->flags & ATA_FLAG_SATA) ? 'S' : 'P', ata_mode_string(xfer_mask), ap->link.eh_info.desc); ata_ehi_clear_desc(&ap->link.eh_info); } else ata_port_info(ap, "DUMMY\n"); } /* perform each probe asynchronously */ for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; ap->cookie = async_schedule(async_port_probe, ap); } return 0; err_tadd: while (--i >= 0) { ata_tport_delete(host->ports[i]); } return rc; } EXPORT_SYMBOL_GPL(ata_host_register); /** * ata_host_activate - start host, request IRQ and register it * @host: target ATA host * @irq: IRQ to request * @irq_handler: irq_handler used when requesting IRQ * @irq_flags: irq_flags used when requesting IRQ * @sht: scsi_host_template to use when registering the host * * After allocating an ATA host and initializing it, most libata * LLDs perform three steps to activate the host - start host, * request IRQ and register it. This helper takes necessary * arguments and performs the three steps in one go. * * An invalid IRQ skips the IRQ registration and expects the host to * have set polling mode on the port. In this case, @irq_handler * should be NULL. * * LOCKING: * Inherited from calling layer (may sleep). * * RETURNS: * 0 on success, -errno otherwise. */ int ata_host_activate(struct ata_host *host, int irq, irq_handler_t irq_handler, unsigned long irq_flags, const struct scsi_host_template *sht) { int i, rc; char *irq_desc; rc = ata_host_start(host); if (rc) return rc; /* Special case for polling mode */ if (!irq) { WARN_ON(irq_handler); return ata_host_register(host, sht); } irq_desc = devm_kasprintf(host->dev, GFP_KERNEL, "%s[%s]", dev_driver_string(host->dev), dev_name(host->dev)); if (!irq_desc) return -ENOMEM; rc = devm_request_irq(host->dev, irq, irq_handler, irq_flags, irq_desc, host); if (rc) return rc; for (i = 0; i < host->n_ports; i++) ata_port_desc_misc(host->ports[i], irq); rc = ata_host_register(host, sht); /* if failed, just free the IRQ and leave ports alone */ if (rc) devm_free_irq(host->dev, irq, host); return rc; } EXPORT_SYMBOL_GPL(ata_host_activate); /** * ata_dev_free_resources - Free a device resources * @dev: Target ATA device * * Free resources allocated to support a device features. * * LOCKING: * Kernel thread context (may sleep). */ void ata_dev_free_resources(struct ata_device *dev) { if (zpodd_dev_enabled(dev)) zpodd_exit(dev); ata_dev_cleanup_cdl_resources(dev); } /** * ata_port_detach - Detach ATA port in preparation of device removal * @ap: ATA port to be detached * * Detach all ATA devices and the associated SCSI devices of @ap; * then, remove the associated SCSI host. @ap is guaranteed to * be quiescent on return from this function. * * LOCKING: * Kernel thread context (may sleep). */ static void ata_port_detach(struct ata_port *ap) { unsigned long flags; struct ata_link *link; struct ata_device *dev; /* Ensure ata_port probe has completed */ async_synchronize_cookie(ap->cookie + 1); /* Wait for any ongoing EH */ ata_port_wait_eh(ap); mutex_lock(&ap->scsi_scan_mutex); spin_lock_irqsave(ap->lock, flags); /* Remove scsi devices */ ata_for_each_link(link, ap, HOST_FIRST) { ata_for_each_dev(dev, link, ALL) { if (dev->sdev) { spin_unlock_irqrestore(ap->lock, flags); scsi_remove_device(dev->sdev); spin_lock_irqsave(ap->lock, flags); dev->sdev = NULL; } } } /* Tell EH to disable all devices */ ap->pflags |= ATA_PFLAG_UNLOADING; ata_port_schedule_eh(ap); spin_unlock_irqrestore(ap->lock, flags); mutex_unlock(&ap->scsi_scan_mutex); /* wait till EH commits suicide */ ata_port_wait_eh(ap); /* it better be dead now */ WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED)); cancel_delayed_work_sync(&ap->hotplug_task); cancel_delayed_work_sync(&ap->scsi_rescan_task); /* Delete port multiplier link transport devices */ if (ap->pmp_link) { int i; for (i = 0; i < SATA_PMP_MAX_PORTS; i++) ata_tlink_delete(&ap->pmp_link[i]); } /* Remove the associated SCSI host */ scsi_remove_host(ap->scsi_host); ata_tport_delete(ap); } /** * ata_host_detach - Detach all ports of an ATA host * @host: Host to detach * * Detach all ports of @host. * * LOCKING: * Kernel thread context (may sleep). */ void ata_host_detach(struct ata_host *host) { int i; for (i = 0; i < host->n_ports; i++) ata_port_detach(host->ports[i]); /* the host is dead now, dissociate ACPI */ ata_acpi_dissociate(host); } EXPORT_SYMBOL_GPL(ata_host_detach); #ifdef CONFIG_PCI /** * ata_pci_remove_one - PCI layer callback for device removal * @pdev: PCI device that was removed * * PCI layer indicates to libata via this hook that hot-unplug or * module unload event has occurred. Detach all ports. Resource * release is handled via devres. * * LOCKING: * Inherited from PCI layer (may sleep). */ void ata_pci_remove_one(struct pci_dev *pdev) { struct ata_host *host = pci_get_drvdata(pdev); ata_host_detach(host); } EXPORT_SYMBOL_GPL(ata_pci_remove_one); void ata_pci_shutdown_one(struct pci_dev *pdev) { struct ata_host *host = pci_get_drvdata(pdev); int i; for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; ap->pflags |= ATA_PFLAG_FROZEN; /* Disable port interrupts */ if (ap->ops->freeze) ap->ops->freeze(ap); /* Stop the port DMA engines */ if (ap->ops->port_stop) ap->ops->port_stop(ap); } } EXPORT_SYMBOL_GPL(ata_pci_shutdown_one); /* move to PCI subsystem */ int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits) { unsigned long tmp = 0; switch (bits->width) { case 1: { u8 tmp8 = 0; pci_read_config_byte(pdev, bits->reg, &tmp8); tmp = tmp8; break; } case 2: { u16 tmp16 = 0; pci_read_config_word(pdev, bits->reg, &tmp16); tmp = tmp16; break; } case 4: { u32 tmp32 = 0; pci_read_config_dword(pdev, bits->reg, &tmp32); tmp = tmp32; break; } default: return -EINVAL; } tmp &= bits->mask; return (tmp == bits->val) ? 1 : 0; } EXPORT_SYMBOL_GPL(pci_test_config_bits); #ifdef CONFIG_PM void ata_pci_device_do_suspend(struct pci_dev *pdev, pm_message_t mesg) { pci_save_state(pdev); pci_disable_device(pdev); if (mesg.event & PM_EVENT_SLEEP) pci_set_power_state(pdev, PCI_D3hot); } EXPORT_SYMBOL_GPL(ata_pci_device_do_suspend); int ata_pci_device_do_resume(struct pci_dev *pdev) { int rc; pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); rc = pcim_enable_device(pdev); if (rc) { dev_err(&pdev->dev, "failed to enable device after resume (%d)\n", rc); return rc; } pci_set_master(pdev); return 0; } EXPORT_SYMBOL_GPL(ata_pci_device_do_resume); int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg) { struct ata_host *host = pci_get_drvdata(pdev); ata_host_suspend(host, mesg); ata_pci_device_do_suspend(pdev, mesg); return 0; } EXPORT_SYMBOL_GPL(ata_pci_device_suspend); int ata_pci_device_resume(struct pci_dev *pdev) { struct ata_host *host = pci_get_drvdata(pdev); int rc; rc = ata_pci_device_do_resume(pdev); if (rc == 0) ata_host_resume(host); return rc; } EXPORT_SYMBOL_GPL(ata_pci_device_resume); #endif /* CONFIG_PM */ #endif /* CONFIG_PCI */ /** * ata_platform_remove_one - Platform layer callback for device removal * @pdev: Platform device that was removed * * Platform layer indicates to libata via this hook that hot-unplug or * module unload event has occurred. Detach all ports. Resource * release is handled via devres. * * LOCKING: * Inherited from platform layer (may sleep). */ void ata_platform_remove_one(struct platform_device *pdev) { struct ata_host *host = platform_get_drvdata(pdev); ata_host_detach(host); } EXPORT_SYMBOL_GPL(ata_platform_remove_one); #ifdef CONFIG_ATA_FORCE #define force_cbl(name, flag) \ { #name, .cbl = (flag) } #define force_spd_limit(spd, val) \ { #spd, .spd_limit = (val) } #define force_xfer(mode, shift) \ { #mode, .xfer_mask = (1UL << (shift)) } #define force_lflag_on(name, flags) \ { #name, .lflags_on = (flags) } #define force_lflag_onoff(name, flags) \ { "no" #name, .lflags_on = (flags) }, \ { #name, .lflags_off = (flags) } #define force_pflag_on(name, flags) \ { #name, .pflags_on = (flags) } #define force_quirk_on(name, flag) \ { #name, .quirk_on = (flag) } #define force_quirk_onoff(name, flag) \ { "no" #name, .quirk_on = (flag) }, \ { #name, .quirk_off = (flag) } static const struct ata_force_param force_tbl[] __initconst = { force_cbl(40c, ATA_CBL_PATA40), force_cbl(80c, ATA_CBL_PATA80), force_cbl(short40c, ATA_CBL_PATA40_SHORT), force_cbl(unk, ATA_CBL_PATA_UNK), force_cbl(ign, ATA_CBL_PATA_IGN), force_cbl(sata, ATA_CBL_SATA), force_spd_limit(1.5Gbps, 1), force_spd_limit(3.0Gbps, 2), force_xfer(pio0, ATA_SHIFT_PIO + 0), force_xfer(pio1, ATA_SHIFT_PIO + 1), force_xfer(pio2, ATA_SHIFT_PIO + 2), force_xfer(pio3, ATA_SHIFT_PIO + 3), force_xfer(pio4, ATA_SHIFT_PIO + 4), force_xfer(pio5, ATA_SHIFT_PIO + 5), force_xfer(pio6, ATA_SHIFT_PIO + 6), force_xfer(mwdma0, ATA_SHIFT_MWDMA + 0), force_xfer(mwdma1, ATA_SHIFT_MWDMA + 1), force_xfer(mwdma2, ATA_SHIFT_MWDMA + 2), force_xfer(mwdma3, ATA_SHIFT_MWDMA + 3), force_xfer(mwdma4, ATA_SHIFT_MWDMA + 4), force_xfer(udma0, ATA_SHIFT_UDMA + 0), force_xfer(udma16, ATA_SHIFT_UDMA + 0), force_xfer(udma/16, ATA_SHIFT_UDMA + 0), force_xfer(udma1, ATA_SHIFT_UDMA + 1), force_xfer(udma25, ATA_SHIFT_UDMA + 1), force_xfer(udma/25, ATA_SHIFT_UDMA + 1), force_xfer(udma2, ATA_SHIFT_UDMA + 2), force_xfer(udma33, ATA_SHIFT_UDMA + 2), force_xfer(udma/33, ATA_SHIFT_UDMA + 2), force_xfer(udma3, ATA_SHIFT_UDMA + 3), force_xfer(udma44, ATA_SHIFT_UDMA + 3), force_xfer(udma/44, ATA_SHIFT_UDMA + 3), force_xfer(udma4, ATA_SHIFT_UDMA + 4), force_xfer(udma66, ATA_SHIFT_UDMA + 4), force_xfer(udma/66, ATA_SHIFT_UDMA + 4), force_xfer(udma5, ATA_SHIFT_UDMA + 5), force_xfer(udma100, ATA_SHIFT_UDMA + 5), force_xfer(udma/100, ATA_SHIFT_UDMA + 5), force_xfer(udma6, ATA_SHIFT_UDMA + 6), force_xfer(udma133, ATA_SHIFT_UDMA + 6), force_xfer(udma/133, ATA_SHIFT_UDMA + 6), force_xfer(udma7, ATA_SHIFT_UDMA + 7), force_lflag_on(nohrst, ATA_LFLAG_NO_HRST), force_lflag_on(nosrst, ATA_LFLAG_NO_SRST), force_lflag_on(norst, ATA_LFLAG_NO_HRST | ATA_LFLAG_NO_SRST), force_lflag_on(rstonce, ATA_LFLAG_RST_ONCE), force_lflag_onoff(dbdelay, ATA_LFLAG_NO_DEBOUNCE_DELAY), force_pflag_on(external, ATA_PFLAG_EXTERNAL), force_quirk_onoff(ncq, ATA_QUIRK_NONCQ), force_quirk_onoff(ncqtrim, ATA_QUIRK_NO_NCQ_TRIM), force_quirk_onoff(ncqati, ATA_QUIRK_NO_NCQ_ON_ATI), force_quirk_onoff(trim, ATA_QUIRK_NOTRIM), force_quirk_on(trim_zero, ATA_QUIRK_ZERO_AFTER_TRIM), force_quirk_on(max_trim_128m, ATA_QUIRK_MAX_TRIM_128M), force_quirk_onoff(dma, ATA_QUIRK_NODMA), force_quirk_on(atapi_dmadir, ATA_QUIRK_ATAPI_DMADIR), force_quirk_on(atapi_mod16_dma, ATA_QUIRK_ATAPI_MOD16_DMA), force_quirk_onoff(dmalog, ATA_QUIRK_NO_DMA_LOG), force_quirk_onoff(iddevlog, ATA_QUIRK_NO_ID_DEV_LOG), force_quirk_onoff(logdir, ATA_QUIRK_NO_LOG_DIR), force_quirk_on(max_sec_128, ATA_QUIRK_MAX_SEC_128), force_quirk_on(max_sec_1024, ATA_QUIRK_MAX_SEC_1024), force_quirk_on(max_sec_lba48, ATA_QUIRK_MAX_SEC_LBA48), force_quirk_onoff(lpm, ATA_QUIRK_NOLPM), force_quirk_onoff(setxfer, ATA_QUIRK_NOSETXFER), force_quirk_on(dump_id, ATA_QUIRK_DUMP_ID), force_quirk_onoff(fua, ATA_QUIRK_NO_FUA), force_quirk_on(disable, ATA_QUIRK_DISABLE), }; static int __init ata_parse_force_one(char **cur, struct ata_force_ent *force_ent, const char **reason) { char *start = *cur, *p = *cur; char *id, *val, *endp; const struct ata_force_param *match_fp = NULL; int nr_matches = 0, i; /* find where this param ends and update *cur */ while (*p != '\0' && *p != ',') p++; if (*p == '\0') *cur = p; else *cur = p + 1; *p = '\0'; /* parse */ p = strchr(start, ':'); if (!p) { val = strstrip(start); goto parse_val; } *p = '\0'; id = strstrip(start); val = strstrip(p + 1); /* parse id */ p = strchr(id, '.'); if (p) { *p++ = '\0'; force_ent->device = simple_strtoul(p, &endp, 10); if (p == endp || *endp != '\0') { *reason = "invalid device"; return -EINVAL; } } force_ent->port = simple_strtoul(id, &endp, 10); if (id == endp || *endp != '\0') { *reason = "invalid port/link"; return -EINVAL; } parse_val: /* parse val, allow shortcuts so that both 1.5 and 1.5Gbps work */ for (i = 0; i < ARRAY_SIZE(force_tbl); i++) { const struct ata_force_param *fp = &force_tbl[i]; if (strncasecmp(val, fp->name, strlen(val))) continue; nr_matches++; match_fp = fp; if (strcasecmp(val, fp->name) == 0) { nr_matches = 1; break; } } if (!nr_matches) { *reason = "unknown value"; return -EINVAL; } if (nr_matches > 1) { *reason = "ambiguous value"; return -EINVAL; } force_ent->param = *match_fp; return 0; } static void __init ata_parse_force_param(void) { int idx = 0, size = 1; int last_port = -1, last_device = -1; char *p, *cur, *next; /* Calculate maximum number of params and allocate ata_force_tbl */ for (p = ata_force_param_buf; *p; p++) if (*p == ',') size++; ata_force_tbl = kcalloc(size, sizeof(ata_force_tbl[0]), GFP_KERNEL); if (!ata_force_tbl) { printk(KERN_WARNING "ata: failed to extend force table, " "libata.force ignored\n"); return; } /* parse and populate the table */ for (cur = ata_force_param_buf; *cur != '\0'; cur = next) { const char *reason = ""; struct ata_force_ent te = { .port = -1, .device = -1 }; next = cur; if (ata_parse_force_one(&next, &te, &reason)) { printk(KERN_WARNING "ata: failed to parse force " "parameter \"%s\" (%s)\n", cur, reason); continue; } if (te.port == -1) { te.port = last_port; te.device = last_device; } ata_force_tbl[idx++] = te; last_port = te.port; last_device = te.device; } ata_force_tbl_size = idx; } static void ata_free_force_param(void) { kfree(ata_force_tbl); } #else static inline void ata_parse_force_param(void) { } static inline void ata_free_force_param(void) { } #endif static int __init ata_init(void) { int rc; ata_parse_force_param(); rc = ata_sff_init(); if (rc) { ata_free_force_param(); return rc; } libata_transport_init(); ata_scsi_transport_template = ata_attach_transport(); if (!ata_scsi_transport_template) { ata_sff_exit(); rc = -ENOMEM; goto err_out; } printk(KERN_DEBUG "libata version " DRV_VERSION " loaded.\n"); return 0; err_out: return rc; } static void __exit ata_exit(void) { ata_release_transport(ata_scsi_transport_template); libata_transport_exit(); ata_sff_exit(); ata_free_force_param(); } subsys_initcall(ata_init); module_exit(ata_exit); static DEFINE_RATELIMIT_STATE(ratelimit, HZ / 5, 1); int ata_ratelimit(void) { return __ratelimit(&ratelimit); } EXPORT_SYMBOL_GPL(ata_ratelimit); /** * ata_msleep - ATA EH owner aware msleep * @ap: ATA port to attribute the sleep to * @msecs: duration to sleep in milliseconds * * Sleeps @msecs. If the current task is owner of @ap's EH, the * ownership is released before going to sleep and reacquired * after the sleep is complete. IOW, other ports sharing the * @ap->host will be allowed to own the EH while this task is * sleeping. * * LOCKING: * Might sleep. */ void ata_msleep(struct ata_port *ap, unsigned int msecs) { bool owns_eh = ap && ap->host->eh_owner == current; if (owns_eh) ata_eh_release(ap); if (msecs < 20) { unsigned long usecs = msecs * USEC_PER_MSEC; usleep_range(usecs, usecs + 50); } else { msleep(msecs); } if (owns_eh) ata_eh_acquire(ap); } EXPORT_SYMBOL_GPL(ata_msleep); /** * ata_wait_register - wait until register value changes * @ap: ATA port to wait register for, can be NULL * @reg: IO-mapped register * @mask: Mask to apply to read register value * @val: Wait condition * @interval: polling interval in milliseconds * @timeout: timeout in milliseconds * * Waiting for some bits of register to change is a common * operation for ATA controllers. This function reads 32bit LE * IO-mapped register @reg and tests for the following condition. * * (*@reg & mask) != val * * If the condition is met, it returns; otherwise, the process is * repeated after @interval_msec until timeout. * * LOCKING: * Kernel thread context (may sleep) * * RETURNS: * The final register value. */ u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask, u32 val, unsigned int interval, unsigned int timeout) { unsigned long deadline; u32 tmp; tmp = ioread32(reg); /* Calculate timeout _after_ the first read to make sure * preceding writes reach the controller before starting to * eat away the timeout. */ deadline = ata_deadline(jiffies, timeout); while ((tmp & mask) == val && time_before(jiffies, deadline)) { ata_msleep(ap, interval); tmp = ioread32(reg); } return tmp; } EXPORT_SYMBOL_GPL(ata_wait_register); /* * Dummy port_ops */ static unsigned int ata_dummy_qc_issue(struct ata_queued_cmd *qc) { return AC_ERR_SYSTEM; } static void ata_dummy_error_handler(struct ata_port *ap) { /* truly dummy */ } struct ata_port_operations ata_dummy_port_ops = { .qc_issue = ata_dummy_qc_issue, .error_handler = ata_dummy_error_handler, .sched_eh = ata_std_sched_eh, .end_eh = ata_std_end_eh, }; EXPORT_SYMBOL_GPL(ata_dummy_port_ops); const struct ata_port_info ata_dummy_port_info = { .port_ops = &ata_dummy_port_ops, }; EXPORT_SYMBOL_GPL(ata_dummy_port_info); EXPORT_TRACEPOINT_SYMBOL_GPL(ata_tf_load); EXPORT_TRACEPOINT_SYMBOL_GPL(ata_exec_command); EXPORT_TRACEPOINT_SYMBOL_GPL(ata_bmdma_setup); EXPORT_TRACEPOINT_SYMBOL_GPL(ata_bmdma_start); EXPORT_TRACEPOINT_SYMBOL_GPL(ata_bmdma_status);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ocfs2.h * * Defines macros and structures used in OCFS2 * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef OCFS2_H #define OCFS2_H #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/lockdep.h> #include <linux/jbd2.h> /* For union ocfs2_dlm_lksb */ #include "stackglue.h" #include "ocfs2_fs.h" #include "ocfs2_lockid.h" #include "ocfs2_ioctl.h" /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" #include "reservations.h" #include "filecheck.h" /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small * amount of inlined blocks to be stored on an array and grow the * structure into a rb tree when necessary. */ #define OCFS2_CACHE_INFO_MAX_ARRAY 2 /* Flags for ocfs2_caching_info */ enum ocfs2_caching_info_flags { /* Indicates that the metadata cache is using the inline array */ OCFS2_CACHE_FL_INLINE = 1<<1, }; struct ocfs2_caching_operations; struct ocfs2_caching_info { /* * The parent structure provides the locks, but because the * parent structure can differ, it provides locking operations * to struct ocfs2_caching_info. */ const struct ocfs2_caching_operations *ci_ops; /* next two are protected by trans_inc_lock */ /* which transaction were we created on? Zero if none. */ unsigned long ci_created_trans; /* last transaction we were a part of. */ unsigned long ci_last_trans; /* Cache structures */ unsigned int ci_flags; unsigned int ci_num_cached; union { sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; struct rb_root ci_tree; } ci_cache; }; /* * Need this prototype here instead of in uptodate.h because journal.h * uses it. */ struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); /* this limits us to 256 nodes * if we need more, we can do a kmalloc for the map */ #define OCFS2_NODE_MAP_MAX_NODES 256 struct ocfs2_node_map { u16 num_nodes; unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; }; enum ocfs2_ast_action { OCFS2_AST_INVALID = 0, OCFS2_AST_ATTACH, OCFS2_AST_CONVERT, OCFS2_AST_DOWNCONVERT, }; /* actions for an unlockast function to take. */ enum ocfs2_unlock_action { OCFS2_UNLOCK_INVALID = 0, OCFS2_UNLOCK_CANCEL_CONVERT, OCFS2_UNLOCK_DROP_LOCK, }; /* ocfs2_lock_res->l_flags flags. */ #define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized * the lvb */ #define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in * dlm_lock */ #define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to * downconvert*/ #define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ #define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) #define OCFS2_LOCK_REFRESHING (0x00000020) #define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization * for shutdown paths */ #define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track * when to skip queueing * a lock because it's * about to be * dropped. */ #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ #define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a call to dlm_lock. Only exists with BUSY set. */ #define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread * from downconverting * before the upconvert * has completed */ #define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster * lock has already * returned, do not block * dc thread from * downconverting */ struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats { u64 ls_total; /* Total wait in NSEC */ u32 ls_gets; /* Num acquires */ u32 ls_fail; /* Num failed acquires */ /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ u64 ls_last; /* Last unlock time in USEC */ }; #endif struct ocfs2_lock_res { void *l_priv; const struct ocfs2_lock_res_ops *l_ops; struct list_head l_blocked_list; struct list_head l_mask_waiters; struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; signed char l_level; signed char l_requested; signed char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; /* used from AST/BAST funcs. */ /* Data packed - enum type ocfs2_ast_action */ unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; unsigned int l_pending_gen; spinlock_t l_lock; struct ocfs2_dlm_lksb l_lksb; wait_queue_head_t l_event; struct list_head l_debug_list; #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ u32 l_lock_refresh; /* Disk refreshes */ u64 l_lock_wait; /* First lock wait time */ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map l_lockdep_map; #endif }; enum ocfs2_orphan_reco_type { ORPHAN_NO_NEED_TRUNCATE = 0, ORPHAN_NEED_TRUNCATE, }; enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE }; struct ocfs2_orphan_scan { struct mutex os_lock; struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ }; struct ocfs2_dlm_debug { struct kref d_refcnt; u32 d_filter_secs; struct list_head d_lockres_tracking; }; enum ocfs2_vol_state { VOLUME_INIT = 0, VOLUME_MOUNTED, VOLUME_MOUNTED_QUOTAS, VOLUME_DISMOUNTED, VOLUME_DISABLED }; struct ocfs2_alloc_stats { atomic_t moves; atomic_t local_data; atomic_t bitmap_data; atomic_t bg_allocs; atomic_t bg_extends; }; enum ocfs2_local_alloc_state { OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for * this mountpoint. */ OCFS2_LA_ENABLED, /* Local alloc is in use. */ OCFS2_LA_THROTTLED, /* Local alloc is in use, but number * of bits has been reduced. */ OCFS2_LA_DISABLED /* Local alloc has temporarily been * disabled. */ }; enum ocfs2_mount_options { OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT writes */ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 #define OCFS2_OSB_HARD_RO 0x0002 #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_triggers { struct jbd2_buffer_trigger_type ot_triggers; int ot_offset; struct super_block *sb; }; enum ocfs2_journal_trigger_type { OCFS2_JTR_DI, OCFS2_JTR_EB, OCFS2_JTR_RB, OCFS2_JTR_GD, OCFS2_JTR_DB, OCFS2_JTR_XB, OCFS2_JTR_DQ, OCFS2_JTR_DR, OCFS2_JTR_DL, OCFS2_JTR_NONE /* This must be the last entry */ }; #define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE void ocfs2_initialize_journal_triggers(struct super_block *sb, struct ocfs2_triggers triggers[]); enum ocfs2_recovery_state { OCFS2_REC_ENABLED = 0, OCFS2_REC_QUOTA_WANT_DISABLE, /* * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for * ocfs2_recovery_disable_quota() to work. */ OCFS2_REC_QUOTA_DISABLED, OCFS2_REC_WANT_DISABLE, /* * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work */ OCFS2_REC_DISABLED, }; struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; struct ocfs2_super { struct task_struct *commit_task; struct super_block *sb; struct inode *root_inode; struct inode *sys_root_inode; struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; struct inode **local_system_inodes; struct ocfs2_slot_info *slot_info; u32 *slot_recovery_generations; spinlock_t node_map_lock; u64 root_blkno; u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; char *uuid_str; u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; u32 s_feature_compat; u32 s_feature_incompat; u32 s_feature_ro_compat; /* Protects s_next_generation, osb_flags and s_inode_steal_slot. * Could protect more on osb as it's very short lived. */ spinlock_t osb_lock; u32 s_next_generation; unsigned long osb_flags; u16 s_inode_steal_slot; u16 s_meta_steal_slot; atomic_t s_num_inodes_stolen; atomic_t s_num_meta_stolen; unsigned long s_mount_opt; unsigned int s_atime_quantum; unsigned int max_slots; unsigned int node_num; int slot_num; int preferred_slot; int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; /* Journal triggers for checksum */ struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; struct delayed_work la_enable_wq; /* * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; unsigned int local_alloc_default_bits; /* osb_clusters_at_boot can become stale! Do not trust it to * be up to date. */ unsigned int osb_clusters_at_boot; enum ocfs2_local_alloc_state local_alloc_state; /* protected * by osb_lock */ struct buffer_head *local_alloc_bh; u64 la_last_gd; struct ocfs2_reservation_map osb_la_resmap; unsigned int osb_resv_level; unsigned int osb_dir_resv_level; /* Next two fields are for local node slot recovery during * mount. */ struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; struct ocfs2_blockcheck_stats osb_ecc_stats; struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct rw_semaphore nfs_sync_rwlock; struct ocfs2_lock_res osb_trim_fs_lockres; struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; wait_queue_head_t recovery_event; spinlock_t dc_task_lock; struct task_struct *dc_task; wait_queue_head_t dc_event; unsigned long dc_wake_sequence; unsigned long dc_work_sequence; /* * Any thread can add locks to the list, but the downconvert * thread is the only one allowed to remove locks. Any change * to this rule requires updating * ocfs2_downconvert_thread_do_work(). */ struct list_head blocked_lock_list; unsigned long blocked_lock_count; /* List of dquot structures to drop last reference to */ struct llist_head dquot_drop_list; struct work_struct dquot_drop_work; wait_queue_head_t osb_mount_event; /* Truncate log info */ struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; atomic_t osb_tl_disable; /* * How many clusters in our truncate log. * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; struct ocfs2_orphan_scan osb_orphan_scan; /* used to protect metaecc calculation check of xattr. */ spinlock_t osb_xattr_lock; unsigned int osb_dx_mask; u32 osb_dx_seed[4]; /* the group we used to allocate inodes. */ u64 osb_inode_alloc_group; /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; struct mutex system_file_mutex; /* * OCFS2 needs to schedule several different types of work which * require cluster locking, disk I/O, recovery waits, etc. Since these * types of work tend to be heavy we avoid using the kernel events * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; /* sysfs directory per partition */ struct kset *osb_dev_kset; /* file check related stuff */ struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) /* Useful typedef for passing around journal access functions */ typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); static inline int ocfs2_should_order_data(struct inode *inode) { if (!S_ISREG(inode->i_mode)) return 0; if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) return 0; return 1; } static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) return 1; return 0; } static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) { /* * Support for sparse files is a pre-requisite */ if (!ocfs2_sparse_alloc(osb)) return 0; if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) return 1; return 0; } static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) return 1; return 0; } static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) return 1; return 0; } static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) return 1; return 0; } static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) return 1; return 0; } static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) return 1; return 0; } static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) return 1; return 0; } static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) { if (ocfs2_supports_indexed_dirs(osb)) return OCFS2_DX_LINK_MAX; return OCFS2_LINK_MAX; } static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) { u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) { u16 lo, hi; lo = nlink; hi = nlink >> OCFS2_LINKS_HI_SHIFT; di->i_links_count = cpu_to_le16(lo); di->i_links_count_hi = cpu_to_le16(hi); } static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) { u32 links = ocfs2_read_links_count(di); links += n; ocfs2_set_links_count(di, links); } static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) return 1; return 0; } /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock * too! */ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, unsigned long flag) { spin_lock(&osb->osb_lock); osb->osb_flags |= flag; spin_unlock(&osb->osb_lock); } static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { spin_lock(&osb->osb_lock); osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); if (hard) osb->osb_flags |= OCFS2_OSB_HARD_RO; else osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); } static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_HARD_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) { return (osb->s_feature_incompat & (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); } static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) { return ocfs2_o2cb_stack(osb) && (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); } static inline int ocfs2_mount_local(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); } #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) #define OCFS2_IS_VALID_DX_ROOT(ptr) \ (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) #define OCFS2_IS_VALID_DX_LEAF(ptr) \ (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) #define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { return (unsigned long)(blkno & (u64)ULONG_MAX); } static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, u32 clusters) { int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u64)clusters << c_to_b_bits; } static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; blocks += (1 << b_to_c_bits) - 1; return (u32)(blocks >> b_to_c_bits); } static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u32)(blocks >> b_to_c_bits); } static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; bytes += OCFS2_SB(sb)->s_clustersize - 1; /* OCFS2 just cannot have enough clusters to overflow this */ clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { bytes += sb->s_blocksize - 1; return bytes >> sb->s_blocksize_bits; } static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, u32 clusters) { return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; } static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, u64 blocks) { int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; unsigned int clusters; clusters = ocfs2_blocks_to_clusters(sb, blocks); return (u64)clusters << bits; } static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = ocfs2_clusters_for_bytes(sb, bytes); return (u64)clusters << cl_bits; } static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, u64 bytes) { u64 blocks; blocks = ocfs2_blocks_for_bytes(sb, bytes); return blocks << sb->s_blocksize_bits; } static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) { return (unsigned long)((bytes + 511) >> 9); } static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, unsigned long pg_index) { u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; if (unlikely(PAGE_SHIFT > cbits)) clusters = pg_index << (PAGE_SHIFT - cbits); else if (PAGE_SHIFT < cbits) clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } /* * Find the 1st page index which covers the given clusters. */ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, u32 clusters) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; if (PAGE_SHIFT > cbits) { index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); } else if (PAGE_SHIFT < cbits) { index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; } static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; if (PAGE_SHIFT < cbits) pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, unsigned int megs) { BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, unsigned int clusters) { return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { __set_bit_le(bit, bitmap); } #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) { __clear_bit_le(bit, bitmap); } #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) #define ocfs2_test_bit test_bit_le #define ocfs2_find_next_zero_bit find_next_zero_bit_le #define ocfs2_find_next_bit find_next_bit_le static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_set_bit(bit, bitmap); } static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_clear_bit(bit, bitmap); } static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); return ocfs2_test_bit(bit, bitmap); } static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, int start) { int fix = 0, ret, tmpmax; bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); tmpmax = max + fix; start += fix; ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; if (ret > max) return max; return ret; } #endif /* OCFS2_H */
2 2 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 // SPDX-License-Identifier: GPL-2.0+ /* * comedi/drivers/rti800.c * Hardware driver for Analog Devices RTI-800/815 board * * COMEDI - Linux Control and Measurement Device Interface * Copyright (C) 1998 David A. Schleef <ds@schleef.org> */ /* * Driver: rti800 * Description: Analog Devices RTI-800/815 * Devices: [Analog Devices] RTI-800 (rti800), RTI-815 (rti815) * Author: David A. Schleef <ds@schleef.org> * Status: unknown * Updated: Fri, 05 Sep 2008 14:50:44 +0100 * * Configuration options: * [0] - I/O port base address * [1] - IRQ (not supported / unused) * [2] - A/D mux/reference (number of channels) * 0 = differential * 1 = pseudodifferential (common) * 2 = single-ended * [3] - A/D range * 0 = [-10,10] * 1 = [-5,5] * 2 = [0,10] * [4] - A/D encoding * 0 = two's complement * 1 = straight binary * [5] - DAC 0 range * 0 = [-10,10] * 1 = [0,10] * [6] - DAC 0 encoding * 0 = two's complement * 1 = straight binary * [7] - DAC 1 range (same as DAC 0) * [8] - DAC 1 encoding (same as DAC 0) */ #include <linux/module.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/comedi/comedidev.h> /* * Register map */ #define RTI800_CSR 0x00 #define RTI800_CSR_BUSY BIT(7) #define RTI800_CSR_DONE BIT(6) #define RTI800_CSR_OVERRUN BIT(5) #define RTI800_CSR_TCR BIT(4) #define RTI800_CSR_DMA_ENAB BIT(3) #define RTI800_CSR_INTR_TC BIT(2) #define RTI800_CSR_INTR_EC BIT(1) #define RTI800_CSR_INTR_OVRN BIT(0) #define RTI800_MUXGAIN 0x01 #define RTI800_CONVERT 0x02 #define RTI800_ADCLO 0x03 #define RTI800_ADCHI 0x04 #define RTI800_DAC0LO 0x05 #define RTI800_DAC0HI 0x06 #define RTI800_DAC1LO 0x07 #define RTI800_DAC1HI 0x08 #define RTI800_CLRFLAGS 0x09 #define RTI800_DI 0x0a #define RTI800_DO 0x0b #define RTI800_9513A_DATA 0x0c #define RTI800_9513A_CNTRL 0x0d #define RTI800_9513A_STATUS 0x0d static const struct comedi_lrange range_rti800_ai_10_bipolar = { 4, { BIP_RANGE(10), BIP_RANGE(1), BIP_RANGE(0.1), BIP_RANGE(0.02) } }; static const struct comedi_lrange range_rti800_ai_5_bipolar = { 4, { BIP_RANGE(5), BIP_RANGE(0.5), BIP_RANGE(0.05), BIP_RANGE(0.01) } }; static const struct comedi_lrange range_rti800_ai_unipolar = { 4, { UNI_RANGE(10), UNI_RANGE(1), UNI_RANGE(0.1), UNI_RANGE(0.02) } }; static const struct comedi_lrange *const rti800_ai_ranges[] = { &range_rti800_ai_10_bipolar, &range_rti800_ai_5_bipolar, &range_rti800_ai_unipolar, }; static const struct comedi_lrange *const rti800_ao_ranges[] = { &range_bipolar10, &range_unipolar10, }; struct rti800_board { const char *name; int has_ao; }; static const struct rti800_board rti800_boardtypes[] = { { .name = "rti800", }, { .name = "rti815", .has_ao = 1, }, }; struct rti800_private { bool adc_2comp; bool dac_2comp[2]; const struct comedi_lrange *ao_range_type_list[2]; unsigned char muxgain_bits; }; static int rti800_ai_eoc(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned long context) { unsigned char status; status = inb(dev->iobase + RTI800_CSR); if (status & RTI800_CSR_OVERRUN) { outb(0, dev->iobase + RTI800_CLRFLAGS); return -EOVERFLOW; } if (status & RTI800_CSR_DONE) return 0; return -EBUSY; } static int rti800_ai_insn_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct rti800_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); unsigned int gain = CR_RANGE(insn->chanspec); unsigned char muxgain_bits; int ret; int i; inb(dev->iobase + RTI800_ADCHI); outb(0, dev->iobase + RTI800_CLRFLAGS); muxgain_bits = chan | (gain << 5); if (muxgain_bits != devpriv->muxgain_bits) { devpriv->muxgain_bits = muxgain_bits; outb(devpriv->muxgain_bits, dev->iobase + RTI800_MUXGAIN); /* * Without a delay here, the RTI_CSR_OVERRUN bit * gets set, and you will have an error. */ if (insn->n > 0) { int delay = (gain == 0) ? 10 : (gain == 1) ? 20 : (gain == 2) ? 40 : 80; udelay(delay); } } for (i = 0; i < insn->n; i++) { unsigned int val; outb(0, dev->iobase + RTI800_CONVERT); ret = comedi_timeout(dev, s, insn, rti800_ai_eoc, 0); if (ret) return ret; val = inb(dev->iobase + RTI800_ADCLO); val |= (inb(dev->iobase + RTI800_ADCHI) & 0xf) << 8; if (devpriv->adc_2comp) val = comedi_offset_munge(s, val); data[i] = val; } return insn->n; } static int rti800_ao_insn_write(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct rti800_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); int reg_lo = chan ? RTI800_DAC1LO : RTI800_DAC0LO; int reg_hi = chan ? RTI800_DAC1HI : RTI800_DAC0HI; int i; for (i = 0; i < insn->n; i++) { unsigned int val = data[i]; s->readback[chan] = val; if (devpriv->dac_2comp[chan]) val = comedi_offset_munge(s, val); outb(val & 0xff, dev->iobase + reg_lo); outb((val >> 8) & 0xff, dev->iobase + reg_hi); } return insn->n; } static int rti800_di_insn_bits(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { data[1] = inb(dev->iobase + RTI800_DI); return insn->n; } static int rti800_do_insn_bits(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { if (comedi_dio_update_state(s, data)) { /* Outputs are inverted... */ outb(s->state ^ 0xff, dev->iobase + RTI800_DO); } data[1] = s->state; return insn->n; } static int rti800_attach(struct comedi_device *dev, struct comedi_devconfig *it) { const struct rti800_board *board = dev->board_ptr; struct rti800_private *devpriv; struct comedi_subdevice *s; int ret; ret = comedi_request_region(dev, it->options[0], 0x10); if (ret) return ret; outb(0, dev->iobase + RTI800_CSR); inb(dev->iobase + RTI800_ADCHI); outb(0, dev->iobase + RTI800_CLRFLAGS); devpriv = comedi_alloc_devpriv(dev, sizeof(*devpriv)); if (!devpriv) return -ENOMEM; devpriv->adc_2comp = (it->options[4] == 0); devpriv->dac_2comp[0] = (it->options[6] == 0); devpriv->dac_2comp[1] = (it->options[8] == 0); /* invalid, forces the MUXGAIN register to be set when first used */ devpriv->muxgain_bits = 0xff; ret = comedi_alloc_subdevices(dev, 4); if (ret) return ret; s = &dev->subdevices[0]; /* ai subdevice */ s->type = COMEDI_SUBD_AI; s->subdev_flags = SDF_READABLE | SDF_GROUND; s->n_chan = (it->options[2] ? 16 : 8); s->insn_read = rti800_ai_insn_read; s->maxdata = 0x0fff; s->range_table = (it->options[3] < ARRAY_SIZE(rti800_ai_ranges)) ? rti800_ai_ranges[it->options[3]] : &range_unknown; s = &dev->subdevices[1]; if (board->has_ao) { /* ao subdevice (only on rti815) */ s->type = COMEDI_SUBD_AO; s->subdev_flags = SDF_WRITABLE; s->n_chan = 2; s->maxdata = 0x0fff; s->range_table_list = devpriv->ao_range_type_list; devpriv->ao_range_type_list[0] = (it->options[5] < ARRAY_SIZE(rti800_ao_ranges)) ? rti800_ao_ranges[it->options[5]] : &range_unknown; devpriv->ao_range_type_list[1] = (it->options[7] < ARRAY_SIZE(rti800_ao_ranges)) ? rti800_ao_ranges[it->options[7]] : &range_unknown; s->insn_write = rti800_ao_insn_write; ret = comedi_alloc_subdev_readback(s); if (ret) return ret; } else { s->type = COMEDI_SUBD_UNUSED; } s = &dev->subdevices[2]; /* di */ s->type = COMEDI_SUBD_DI; s->subdev_flags = SDF_READABLE; s->n_chan = 8; s->insn_bits = rti800_di_insn_bits; s->maxdata = 1; s->range_table = &range_digital; s = &dev->subdevices[3]; /* do */ s->type = COMEDI_SUBD_DO; s->subdev_flags = SDF_WRITABLE; s->n_chan = 8; s->insn_bits = rti800_do_insn_bits; s->maxdata = 1; s->range_table = &range_digital; /* * There is also an Am9513 timer on these boards. This subdevice * is not currently supported. */ return 0; } static struct comedi_driver rti800_driver = { .driver_name = "rti800", .module = THIS_MODULE, .attach = rti800_attach, .detach = comedi_legacy_detach, .num_names = ARRAY_SIZE(rti800_boardtypes), .board_name = &rti800_boardtypes[0].name, .offset = sizeof(struct rti800_board), }; module_comedi_driver(rti800_driver); MODULE_DESCRIPTION("Comedi: RTI-800 Multifunction Analog/Digital board"); MODULE_AUTHOR("Comedi https://www.comedi.org"); MODULE_LICENSE("GPL");
1 2 1 5 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2006 Intel Corporation. All rights reserved. */ #ifndef IB_SA_H #define IB_SA_H #include <linux/completion.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/netdevice.h> #include <rdma/ib_verbs.h> #include <rdma/ib_mad.h> #include <rdma/ib_addr.h> #include <rdma/opa_addr.h> enum { IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ IB_SA_METHOD_GET_TABLE = 0x12, IB_SA_METHOD_GET_TABLE_RESP = 0x92, IB_SA_METHOD_DELETE = 0x15, IB_SA_METHOD_DELETE_RESP = 0x95, IB_SA_METHOD_GET_MULTI = 0x14, IB_SA_METHOD_GET_MULTI_RESP = 0x94, IB_SA_METHOD_GET_TRACE_TBL = 0x13 }; #define OPA_SA_CLASS_VERSION 0x80 enum { IB_SA_ATTR_CLASS_PORTINFO = 0x01, IB_SA_ATTR_NOTICE = 0x02, IB_SA_ATTR_INFORM_INFO = 0x03, IB_SA_ATTR_NODE_REC = 0x11, IB_SA_ATTR_PORT_INFO_REC = 0x12, IB_SA_ATTR_SL2VL_REC = 0x13, IB_SA_ATTR_SWITCH_REC = 0x14, IB_SA_ATTR_LINEAR_FDB_REC = 0x15, IB_SA_ATTR_RANDOM_FDB_REC = 0x16, IB_SA_ATTR_MCAST_FDB_REC = 0x17, IB_SA_ATTR_SM_INFO_REC = 0x18, IB_SA_ATTR_LINK_REC = 0x20, IB_SA_ATTR_GUID_INFO_REC = 0x30, IB_SA_ATTR_SERVICE_REC = 0x31, IB_SA_ATTR_PARTITION_REC = 0x33, IB_SA_ATTR_PATH_REC = 0x35, IB_SA_ATTR_VL_ARB_REC = 0x36, IB_SA_ATTR_MC_MEMBER_REC = 0x38, IB_SA_ATTR_TRACE_REC = 0x39, IB_SA_ATTR_MULTI_PATH_REC = 0x3a, IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b, IB_SA_ATTR_INFORM_INFO_REC = 0xf3 }; enum ib_sa_selector { IB_SA_GT = 0, IB_SA_LT = 1, IB_SA_EQ = 2, /* * The meaning of "best" depends on the attribute: for * example, for MTU best will return the largest available * MTU, while for packet life time, best will return the * smallest available life time. */ IB_SA_BEST = 3 }; /* * There are 4 types of join states: * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. * The order corresponds to JoinState bits in MCMemberRecord. */ enum ib_sa_mc_join_states { FULLMEMBER_JOIN, NONMEMBER_JOIN, SENDONLY_NONMEBER_JOIN, SENDONLY_FULLMEMBER_JOIN, NUM_JOIN_MEMBERSHIP_TYPES, }; #define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) /* * Structures for SA records are named "struct ib_sa_xxx_rec." No * attempt is made to pack structures to match the physical layout of * SA records in SA MADs; all packing and unpacking is handled by the * SA query code. * * For a record with structure ib_sa_xxx_rec, the naming convention * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we * never use different abbreviations or otherwise change the spelling * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY). * * Reserved rows are indicated with comments to help maintainability. */ #define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\ IB_SA_COMP_MASK( 1)) #define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2) #define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3) #define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4) #define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5) #define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6) /* reserved: 7 */ #define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8) #define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9) #define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10) #define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11) #define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12) #define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13) #define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14) #define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15) #define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16) #define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17) #define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18) #define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19) #define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20) #define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21) #define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) enum sa_path_rec_type { SA_PATH_REC_TYPE_IB, SA_PATH_REC_TYPE_ROCE_V1, SA_PATH_REC_TYPE_ROCE_V2, SA_PATH_REC_TYPE_OPA }; struct sa_path_rec_ib { __be16 dlid; __be16 slid; u8 raw_traffic; }; /** * struct sa_path_rec_roce - RoCE specific portion of the path record entry * @route_resolved: When set, it indicates that this route is already * resolved for this path record entry. * @dmac: Destination mac address for the given DGID entry * of the path record entry. */ struct sa_path_rec_roce { bool route_resolved; u8 dmac[ETH_ALEN]; }; struct sa_path_rec_opa { __be32 dlid; __be32 slid; u8 raw_traffic; u8 l2_8B; u8 l2_10B; u8 l2_9B; u8 l2_16B; u8 qos_type; u8 qos_priority; }; struct sa_path_rec { union ib_gid dgid; union ib_gid sgid; __be64 service_id; /* reserved */ __be32 flow_label; u8 hop_limit; u8 traffic_class; u8 reversible; u8 numb_path; __be16 pkey; __be16 qos_class; u8 sl; u8 mtu_selector; u8 mtu; u8 rate_selector; u8 rate; u8 packet_life_time_selector; u8 packet_life_time; u8 preference; union { struct sa_path_rec_ib ib; struct sa_path_rec_roce roce; struct sa_path_rec_opa opa; }; enum sa_path_rec_type rec_type; u32 flags; }; struct sa_service_rec { __be64 id; __u8 gid[16]; __be16 pkey; __u8 reserved[2]; __be32 lease; __u8 key[16]; __u8 name[64]; __u8 data_8[16]; __be16 data_16[8]; __be32 data_32[4]; __be64 data_64[2]; }; static inline enum ib_gid_type sa_conv_pathrec_to_gid_type(struct sa_path_rec *rec) { switch (rec->rec_type) { case SA_PATH_REC_TYPE_ROCE_V1: return IB_GID_TYPE_ROCE; case SA_PATH_REC_TYPE_ROCE_V2: return IB_GID_TYPE_ROCE_UDP_ENCAP; default: return IB_GID_TYPE_IB; } } static inline enum sa_path_rec_type sa_conv_gid_to_pathrec_type(enum ib_gid_type type) { switch (type) { case IB_GID_TYPE_ROCE: return SA_PATH_REC_TYPE_ROCE_V1; case IB_GID_TYPE_ROCE_UDP_ENCAP: return SA_PATH_REC_TYPE_ROCE_V2; default: return SA_PATH_REC_TYPE_IB; } } static inline void path_conv_opa_to_ib(struct sa_path_rec *ib, struct sa_path_rec *opa) { if ((be32_to_cpu(opa->opa.dlid) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) || (be32_to_cpu(opa->opa.slid) >= be16_to_cpu(IB_MULTICAST_LID_BASE))) { /* Create OPA GID and zero out the LID */ ib->dgid.global.interface_id = OPA_MAKE_ID(be32_to_cpu(opa->opa.dlid)); ib->dgid.global.subnet_prefix = opa->dgid.global.subnet_prefix; ib->sgid.global.interface_id = OPA_MAKE_ID(be32_to_cpu(opa->opa.slid)); ib->dgid.global.subnet_prefix = opa->dgid.global.subnet_prefix; ib->ib.dlid = 0; ib->ib.slid = 0; } else { ib->ib.dlid = htons(ntohl(opa->opa.dlid)); ib->ib.slid = htons(ntohl(opa->opa.slid)); } ib->service_id = opa->service_id; ib->ib.raw_traffic = opa->opa.raw_traffic; } static inline void path_conv_ib_to_opa(struct sa_path_rec *opa, struct sa_path_rec *ib) { __be32 slid, dlid; if ((ib_is_opa_gid(&ib->sgid)) || (ib_is_opa_gid(&ib->dgid))) { slid = htonl(opa_get_lid_from_gid(&ib->sgid)); dlid = htonl(opa_get_lid_from_gid(&ib->dgid)); } else { slid = htonl(ntohs(ib->ib.slid)); dlid = htonl(ntohs(ib->ib.dlid)); } opa->opa.slid = slid; opa->opa.dlid = dlid; opa->service_id = ib->service_id; opa->opa.raw_traffic = ib->ib.raw_traffic; } /* Convert from OPA to IB path record */ static inline void sa_convert_path_opa_to_ib(struct sa_path_rec *dest, struct sa_path_rec *src) { if (src->rec_type != SA_PATH_REC_TYPE_OPA) return; *dest = *src; dest->rec_type = SA_PATH_REC_TYPE_IB; path_conv_opa_to_ib(dest, src); } /* Convert from IB to OPA path record */ static inline void sa_convert_path_ib_to_opa(struct sa_path_rec *dest, struct sa_path_rec *src) { if (src->rec_type != SA_PATH_REC_TYPE_IB) return; /* Do a structure copy and overwrite the relevant fields */ *dest = *src; dest->rec_type = SA_PATH_REC_TYPE_OPA; path_conv_ib_to_opa(dest, src); } #define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) #define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) #define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) #define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3) #define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4) #define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5) #define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6) #define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7) #define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8) #define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9) #define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10) #define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11) #define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12) #define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13) #define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14) #define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15) #define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16) #define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17) struct ib_sa_mcmember_rec { union ib_gid mgid; union ib_gid port_gid; __be32 qkey; __be16 mlid; u8 mtu_selector; u8 mtu; u8 traffic_class; __be16 pkey; u8 rate_selector; u8 rate; u8 packet_life_time_selector; u8 packet_life_time; u8 sl; __be32 flow_label; u8 hop_limit; u8 scope; u8 join_state; u8 proxy_join; }; /* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */ #define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0) #define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1) #define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2) /* reserved: 3 */ #define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4) #define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5) #define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6) #define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7) #define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8) #define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9) #define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10) #define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11) #define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12) #define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13) #define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14) #define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15) #define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16) #define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17) #define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18) #define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19) #define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20) #define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21) #define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22) #define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23) #define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24) #define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25) #define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26) #define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27) #define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28) #define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29) #define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30) #define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31) #define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32) #define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33) #define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34) #define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35) #define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36) #define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF #define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) #define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) #define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) #define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3) #define IB_SA_GUIDINFO_REC_GID0 IB_SA_COMP_MASK(4) #define IB_SA_GUIDINFO_REC_GID1 IB_SA_COMP_MASK(5) #define IB_SA_GUIDINFO_REC_GID2 IB_SA_COMP_MASK(6) #define IB_SA_GUIDINFO_REC_GID3 IB_SA_COMP_MASK(7) #define IB_SA_GUIDINFO_REC_GID4 IB_SA_COMP_MASK(8) #define IB_SA_GUIDINFO_REC_GID5 IB_SA_COMP_MASK(9) #define IB_SA_GUIDINFO_REC_GID6 IB_SA_COMP_MASK(10) #define IB_SA_GUIDINFO_REC_GID7 IB_SA_COMP_MASK(11) struct ib_sa_guidinfo_rec { __be16 lid; u8 block_num; /* reserved */ u8 res1; __be32 res2; u8 guid_info_list[64]; }; struct ib_sa_client { atomic_t users; struct completion comp; }; /** * ib_sa_register_client - Register an SA client. */ void ib_sa_register_client(struct ib_sa_client *client); /** * ib_sa_unregister_client - Deregister an SA client. * @client: Client object to deregister. */ void ib_sa_unregister_client(struct ib_sa_client *client); struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, unsigned int num_prs, void *context), void *context, struct ib_sa_query **query); int ib_sa_service_rec_get(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct sa_service_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_service_rec *resp, unsigned int num_services, void *context), void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; ib_sa_comp_mask comp_mask; int (*callback)(int status, struct ib_sa_multicast *multicast); void *context; }; /** * ib_sa_join_multicast - Initiates a join request to the specified multicast * group. * @client: SA client * @device: Device associated with the multicast group. * @port_num: Port on the specified device to associate with the multicast * group. * @rec: SA multicast member record specifying group attributes. * @comp_mask: Component mask indicating which group attributes of %rec are * valid. * @gfp_mask: GFP mask for memory allocations. * @callback: User callback invoked once the join operation completes. * @context: User specified context stored with the ib_sa_multicast structure. * * This call initiates a multicast join request with the SA for the specified * multicast group. If the join operation is started successfully, it returns * an ib_sa_multicast structure that is used to track the multicast operation. * Users must free this structure by calling ib_free_multicast, even if the * join operation later fails. (The callback status is non-zero.) * * If the join operation fails; status will be non-zero, with the following * failures possible: * -ETIMEDOUT: The request timed out. * -EIO: An error occurred sending the query. * -EINVAL: The MCMemberRecord values differed from the existing group's. * -ENETRESET: Indicates that an fatal error has occurred on the multicast * group, and the user must rejoin the group to continue using it. */ struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, struct ib_sa_multicast *multicast), void *context); /** * ib_free_multicast - Frees the multicast tracking structure, and releases * any reference on the multicast group. * @multicast: Multicast tracking structure allocated by ib_join_multicast. * * This call blocks until the multicast identifier is destroyed. It may * not be called from within the multicast callback; however, returning a non- * zero value from the callback will result in destroying the multicast * tracking structure. */ void ib_sa_free_multicast(struct ib_sa_multicast *multicast); /** * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and * returns it if found. * @device: Device associated with the multicast group. * @port_num: Port on the specified device to associate with the multicast * group. * @mgid: MGID of multicast group. * @rec: Location to copy SA multicast member record. */ int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec); /** * ib_init_ah_from_mcmember - Initialize address handle attributes based on * an SA multicast member record. */ int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, struct rdma_ah_attr *ah_attr); int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *sgid_attr); /** * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec * to IB MAD wire format. */ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute); /** * ib_sa_pack_service - Convert a service record from struct ib_sa_service_rec * to IB MAD wire format. */ void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute); /** * ib_sa_unpack_service - Convert a service record from MAD format to struct * ib_sa_service_rec. */ void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec); /** * ib_sa_unpack_path - Convert a path record from MAD format to struct * ib_sa_path_rec. */ void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec); /* Support GuidInfoRecord */ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), void *context, struct ib_sa_query **sa_query); static inline bool sa_path_is_roce(struct sa_path_rec *rec) { return ((rec->rec_type == SA_PATH_REC_TYPE_ROCE_V1) || (rec->rec_type == SA_PATH_REC_TYPE_ROCE_V2)); } static inline bool sa_path_is_opa(struct sa_path_rec *rec) { return (rec->rec_type == SA_PATH_REC_TYPE_OPA); } static inline void sa_path_set_slid(struct sa_path_rec *rec, u32 slid) { if (rec->rec_type == SA_PATH_REC_TYPE_IB) rec->ib.slid = cpu_to_be16(slid); else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) rec->opa.slid = cpu_to_be32(slid); } static inline void sa_path_set_dlid(struct sa_path_rec *rec, u32 dlid) { if (rec->rec_type == SA_PATH_REC_TYPE_IB) rec->ib.dlid = cpu_to_be16(dlid); else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) rec->opa.dlid = cpu_to_be32(dlid); } static inline void sa_path_set_raw_traffic(struct sa_path_rec *rec, u8 raw_traffic) { if (rec->rec_type == SA_PATH_REC_TYPE_IB) rec->ib.raw_traffic = raw_traffic; else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) rec->opa.raw_traffic = raw_traffic; } static inline __be32 sa_path_get_slid(struct sa_path_rec *rec) { if (rec->rec_type == SA_PATH_REC_TYPE_IB) return htonl(ntohs(rec->ib.slid)); else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) return rec->opa.slid; return 0; } static inline __be32 sa_path_get_dlid(struct sa_path_rec *rec) { if (rec->rec_type == SA_PATH_REC_TYPE_IB) return htonl(ntohs(rec->ib.dlid)); else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) return rec->opa.dlid; return 0; } static inline u8 sa_path_get_raw_traffic(struct sa_path_rec *rec) { if (rec->rec_type == SA_PATH_REC_TYPE_IB) return rec->ib.raw_traffic; else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) return rec->opa.raw_traffic; return 0; } static inline void sa_path_set_dmac(struct sa_path_rec *rec, u8 *dmac) { if (sa_path_is_roce(rec)) memcpy(rec->roce.dmac, dmac, ETH_ALEN); } static inline void sa_path_set_dmac_zero(struct sa_path_rec *rec) { if (sa_path_is_roce(rec)) eth_zero_addr(rec->roce.dmac); } static inline u8 *sa_path_get_dmac(struct sa_path_rec *rec) { if (sa_path_is_roce(rec)) return rec->roce.dmac; return NULL; } #endif /* IB_SA_H */
3 3 1 1 3 3 3 2 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 /* * linux/fs/hfs/mdb.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains functions for reading/writing the MDB. */ #include <linux/cdrom.h> #include <linux/blkdev.h> #include <linux/nls.h> #include <linux/slab.h> #include "hfs_fs.h" #include "btree.h" /*================ File-local data types ================*/ /* * The HFS Master Directory Block (MDB). * * Also known as the Volume Information Block (VIB), this structure is * the HFS equivalent of a superblock. * * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62 * * modified for HFS Extended */ static int hfs_get_last_session(struct super_block *sb, sector_t *start, sector_t *size) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); /* default values */ *start = 0; *size = bdev_nr_sectors(sb->s_bdev); if (HFS_SB(sb)->session >= 0) { struct cdrom_tocentry te; if (!cdi) return -EINVAL; te.cdte_track = HFS_SB(sb)->session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) || (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) { pr_err("invalid session number or type of track\n"); return -EINVAL; } *start = (sector_t)te.cdte_addr.lba << 2; } else if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag) *start = (sector_t)ms_info.addr.lba << 2; } return 0; } /* * hfs_mdb_get() * * Build the in-core MDB for a filesystem, including * the B-trees and the volume bitmap. */ int hfs_mdb_get(struct super_block *sb) { struct buffer_head *bh; struct hfs_mdb *mdb, *mdb2; unsigned int block; char *ptr; int off2, len, size, sect; sector_t part_start, part_size; loff_t off; __be16 attrib; /* set the device driver to 512-byte blocks */ size = sb_min_blocksize(sb, HFS_SECTOR_SIZE); if (!size) return -EINVAL; if (hfs_get_last_session(sb, &part_start, &part_size)) return -EINVAL; while (1) { /* See if this is an HFS filesystem */ bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) goto out; if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) break; brelse(bh); /* check for a partition block * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) goto out; } HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); if (!size || (size & (HFS_SECTOR_SIZE - 1))) { pr_err("bad allocation block size %d\n", size); goto out_bh; } size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE); /* size must be a multiple of 512 */ while (size & (size - 1)) size -= HFS_SECTOR_SIZE; sect = be16_to_cpu(mdb->drAlBlSt) + part_start; /* align block size to first sector */ while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS)) size >>= 1; /* align block size to weird alloc size */ while (HFS_SB(sb)->alloc_blksz & (size - 1)) size >>= 1; brelse(bh); if (!sb_set_blocksize(sb, size)) { pr_err("unable to set blocksize to %u\n", size); goto out; } bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) goto out; if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC)) goto out_bh; HFS_SB(sb)->mdb_bh = bh; HFS_SB(sb)->mdb = mdb; /* These parameters are read from the MDB, and never written */ HFS_SB(sb)->part_start = part_start; HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks); HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits; HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) / HFS_SB(sb)->alloc_blksz; if (!HFS_SB(sb)->clumpablks) HFS_SB(sb)->clumpablks = 1; HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >> (sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS); /* These parameters are read from and written to the MDB */ HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID)); HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt)); atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt)); /* TRY to get the alternate (backup) MDB. */ sect = part_start + part_size - 2; bh = sb_bread512(sb, sect, mdb2); if (bh) { if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) { HFS_SB(sb)->alt_mdb_bh = bh; HFS_SB(sb)->alt_mdb = mdb2; } else brelse(bh); } if (!HFS_SB(sb)->alt_mdb) { pr_warn("unable to locate alternate MDB\n"); pr_warn("continuing without an alternate MDB\n"); } HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) goto out; /* read in the bitmap */ block = be16_to_cpu(mdb->drVBMSt) + part_start; off = (loff_t)block << HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 8) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, off >> sb->s_blocksize_bits); if (!bh) { pr_err("unable to read volume bitmap\n"); goto out; } off2 = off & (sb->s_blocksize - 1); len = min((int)sb->s_blocksize - off2, size); memcpy(ptr, bh->b_data + off2, len); brelse(bh); ptr += len; off += len; size -= len; } HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); if (!HFS_SB(sb)->ext_tree) { pr_err("unable to open extent tree\n"); goto out; } HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); if (!HFS_SB(sb)->cat_tree) { pr_err("unable to open catalog tree\n"); goto out; } attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); mdb->drAtrb = attrib; be32_add_cpu(&mdb->drWrCnt, 1); mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); sync_dirty_buffer(HFS_SB(sb)->mdb_bh); } return 0; out_bh: brelse(bh); out: hfs_mdb_put(sb); return -EIO; } /* * hfs_mdb_commit() * * Description: * This updates the MDB on disk. * It does not check, if the superblock has been modified, or * if the filesystem has been mounted read-only. It is mainly * called by hfs_sync_fs() and flush_mdb(). * Input Variable(s): * struct hfs_mdb *mdb: Pointer to the hfs MDB * int backup; * Output Variable(s): * NONE * Returns: * void * Preconditions: * 'mdb' points to a "valid" (struct hfs_mdb). * Postconditions: * The HFS MDB and on disk will be updated, by copying the possibly * modified fields from the in memory MDB (in native byte order) to * the disk block buffer. * If 'backup' is non-zero then the alternate MDB is also written * and the function doesn't return until it is actually on disk. */ void hfs_mdb_commit(struct super_block *sb) { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; if (sb_rdonly(sb)) return; lock_buffer(HFS_SB(sb)->mdb_bh); if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); BUG_ON(atomic64_read(&HFS_SB(sb)->next_id) > U32_MAX); mdb->drNxtCNID = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id)); mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX); mdb->drFilCnt = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count)); BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); mdb->drDirCnt = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count)); /* write MDB to disk */ mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* write the backup MDB, not returning until it is written. * we only do this when either the catalog or extents overflow * files grow. */ if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) && HFS_SB(sb)->alt_mdb) { hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec, &mdb->drXTFlSize, NULL); hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, &mdb->drCTFlSize, NULL); lock_buffer(HFS_SB(sb)->alt_mdb_bh); memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); unlock_buffer(HFS_SB(sb)->alt_mdb_bh); mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { struct buffer_head *bh; sector_t block; char *ptr; int off, size, len; block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start; off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1); block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 7) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, block); if (!bh) { pr_err("unable to read volume bitmap\n"); break; } len = min((int)sb->s_blocksize - off, size); lock_buffer(bh); memcpy(bh->b_data + off, ptr, len); unlock_buffer(bh); mark_buffer_dirty(bh); brelse(bh); block++; off = 0; ptr += len; size -= len; } } unlock_buffer(HFS_SB(sb)->mdb_bh); } void hfs_mdb_close(struct super_block *sb) { /* update volume attributes */ if (sb_rdonly(sb)) return; HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* * hfs_mdb_put() * * Release the resources associated with the in-core MDB. */ void hfs_mdb_put(struct super_block *sb) { if (!HFS_SB(sb)) return; /* free the B-trees */ hfs_btree_close(HFS_SB(sb)->ext_tree); hfs_btree_close(HFS_SB(sb)->cat_tree); /* free the buffers holding the primary and alternate MDBs */ brelse(HFS_SB(sb)->mdb_bh); brelse(HFS_SB(sb)->alt_mdb_bh); unload_nls(HFS_SB(sb)->nls_io); unload_nls(HFS_SB(sb)->nls_disk); kfree(HFS_SB(sb)->bitmap); kfree(HFS_SB(sb)); sb->s_fs_info = NULL; }
3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* I2C message transfer tracepoints * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #undef TRACE_SYSTEM #define TRACE_SYSTEM i2c #if !defined(_TRACE_I2C_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_I2C_H #include <linux/i2c.h> #include <linux/tracepoint.h> /* * drivers/i2c/i2c-core-base.c */ extern int i2c_transfer_trace_reg(void); extern void i2c_transfer_trace_unreg(void); /* * __i2c_transfer() write request */ TRACE_EVENT_FN(i2c_write, TP_PROTO(const struct i2c_adapter *adap, const struct i2c_msg *msg, int num), TP_ARGS(adap, msg, num), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, msg_nr ) __field(__u16, addr ) __field(__u16, flags ) __field(__u16, len ) __dynamic_array(__u8, buf, msg->len) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->msg_nr = num; __entry->addr = msg->addr; __entry->flags = msg->flags; __entry->len = msg->len; memcpy(__get_dynamic_array(buf), msg->buf, msg->len); ), TP_printk("i2c-%d #%u a=%03x f=%04x l=%u [%*phD]", __entry->adapter_nr, __entry->msg_nr, __entry->addr, __entry->flags, __entry->len, __entry->len, __get_dynamic_array(buf) ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); /* * __i2c_transfer() read request */ TRACE_EVENT_FN(i2c_read, TP_PROTO(const struct i2c_adapter *adap, const struct i2c_msg *msg, int num), TP_ARGS(adap, msg, num), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, msg_nr ) __field(__u16, addr ) __field(__u16, flags ) __field(__u16, len ) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->msg_nr = num; __entry->addr = msg->addr; __entry->flags = msg->flags; __entry->len = msg->len; ), TP_printk("i2c-%d #%u a=%03x f=%04x l=%u", __entry->adapter_nr, __entry->msg_nr, __entry->addr, __entry->flags, __entry->len ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); /* * __i2c_transfer() read reply */ TRACE_EVENT_FN(i2c_reply, TP_PROTO(const struct i2c_adapter *adap, const struct i2c_msg *msg, int num), TP_ARGS(adap, msg, num), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, msg_nr ) __field(__u16, addr ) __field(__u16, flags ) __field(__u16, len ) __dynamic_array(__u8, buf, msg->len) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->msg_nr = num; __entry->addr = msg->addr; __entry->flags = msg->flags; __entry->len = msg->len; memcpy(__get_dynamic_array(buf), msg->buf, msg->len); ), TP_printk("i2c-%d #%u a=%03x f=%04x l=%u [%*phD]", __entry->adapter_nr, __entry->msg_nr, __entry->addr, __entry->flags, __entry->len, __entry->len, __get_dynamic_array(buf) ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); /* * __i2c_transfer() result */ TRACE_EVENT_FN(i2c_result, TP_PROTO(const struct i2c_adapter *adap, int num, int ret), TP_ARGS(adap, num, ret), TP_STRUCT__entry( __field(int, adapter_nr ) __field(__u16, nr_msgs ) __field(__s16, ret ) ), TP_fast_assign( __entry->adapter_nr = adap->nr; __entry->nr_msgs = num; __entry->ret = ret; ), TP_printk("i2c-%d n=%u ret=%d", __entry->adapter_nr, __entry->nr_msgs, __entry->ret ), i2c_transfer_trace_reg, i2c_transfer_trace_unreg); #endif /* _TRACE_I2C_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
24 4 2 5 20 32 33 9 9 9 9 14 34 34 34 34 12 22 34 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* * net/tipc/addr.c: TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "addr.h" #include "core.h" bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) return true; if (!legacy_format) return false; if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ return true; return false; } void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); tn->trial_addr = hash128to32(id); pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } void tipc_set_node_addr(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); u8 node_id[NODE_ID_LEN] = {0,}; tn->node_addr = addr; if (!tipc_own_id(net)) { sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } tn->trial_addr = addr; tn->addr_trial_end = jiffies; pr_info("Node number set to %u\n", addr); } int tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; /* Already a string ? */ for (i = 0; i < NODE_ID_LEN; i++) { c = id[i]; if (c >= '0' && c <= '9') continue; if (c >= 'A' && c <= 'Z') continue; if (c >= 'a' && c <= 'z') continue; if (c == '.') continue; if (c == ':') continue; if (c == '_') continue; if (c == '-') continue; if (c == '@') continue; if (c != 0) break; } if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; return i; } /* Translate to hex string */ for (i = 0; i < NODE_ID_LEN; i++) sprintf(&str[2 * i], "%02x", id[i]); /* Strip off trailing zeroes */ for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; return i + 1; }
27 27 27 27 27 27 27 27 27 27 27 27 27 27 1 2 2 2 2 4 2 1 4 10 7 10 13 13 13 2004 468 2006 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netprio_cgroup.c Priority Control Group * * Authors: Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/atomic.h> #include <linux/sched/task.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/netprio_cgroup.h> #include <linux/fdtable.h> /* * netprio allocates per-net_device priomap array which is indexed by * css->id. Limiting css ID to 16bits doesn't lose anything. */ #define NETPRIO_ID_MAX USHRT_MAX #define PRIOMAP_MIN_SZ 128 /* * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. */ static int extend_netdev_table(struct net_device *dev, u32 target_idx) { struct netprio_map *old, *new; size_t new_sz, new_len; /* is the existing priomap large enough? */ old = rtnl_dereference(dev->priomap); if (old && old->priomap_len > target_idx) return 0; /* * Determine the new size. Let's keep it power-of-two. We start * from PRIOMAP_MIN_SZ and double it until it's large enough to * accommodate @target_idx. */ new_sz = PRIOMAP_MIN_SZ; while (true) { new_len = (new_sz - offsetof(struct netprio_map, priomap)) / sizeof(new->priomap[0]); if (new_len > target_idx) break; new_sz *= 2; /* overflowed? */ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) return -ENOSPC; } /* allocate & copy */ new = kzalloc(new_sz, GFP_KERNEL); if (!new) return -ENOMEM; if (old) memcpy(new->priomap, old->priomap, old->priomap_len * sizeof(old->priomap[0])); new->priomap_len = new_len; /* install the new priomap */ rcu_assign_pointer(dev->priomap, new); if (old) kfree_rcu(old, rcu); return 0; } /** * netprio_prio - return the effective netprio of a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; int id = css->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); if (!prio && (!map || map->priomap_len <= id)) return 0; ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); map->priomap[id] = prio; return 0; } static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; css = kzalloc(sizeof(*css), GFP_KERNEL); if (!css) return ERR_PTR(-ENOMEM); return css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; if (css->id > NETPRIO_ID_MAX) return -ENOSPC; if (!parent_css) return 0; rtnl_lock(); /* * Inherit prios from the parent. As all prios are set during * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { u32 prio = netprio_prio(parent_css, dev); ret = netprio_set_prio(css, dev, prio); if (ret) break; } rtnl_unlock(); return ret; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css); } static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { return css->id; } static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) seq_printf(sf, "%s %u\n", dev->name, netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } static ssize_t write_priomap(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); if (!dev) return -ENODEV; rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); return 0; } static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } } static struct cftype ss_files[] = { { .name = "prioidx", .read_u64 = read_prioidx, }, { .name = "ifpriomap", .seq_show = read_priomap, .write = write_priomap, }, { } /* terminate */ }; struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, .legacy_cftypes = ss_files, }; static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* * Note this is called with rtnl_lock held so we have update side * protection on our rcu assignments */ switch (event) { case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); if (old) kfree_rcu(old, rcu); break; } return NOTIFY_DONE; } static struct notifier_block netprio_device_notifier = { .notifier_call = netprio_device_event }; static int __init init_cgroup_netprio(void) { register_netdevice_notifier(&netprio_device_notifier); return 0; } subsys_initcall(init_cgroup_netprio);
60 60 7 7 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 // SPDX-License-Identifier: GPL-2.0+ /* * Procedures for creating, accessing and interpreting the device tree. * * Paul Mackerras August 1996. * Copyright (C) 1996-2005 Paul Mackerras. * * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner. * {engebret|bergner}@us.ibm.com * * Adapted for sparc and sparc64 by David S. Miller davem@davemloft.net * * Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell and * Grant Likely. */ #define pr_fmt(fmt) "OF: " fmt #include <linux/cleanup.h> #include <linux/console.h> #include <linux/ctype.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_graph.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/proc_fs.h> #include "of_private.h" LIST_HEAD(aliases_lookup); struct device_node *of_root; EXPORT_SYMBOL(of_root); struct device_node *of_chosen; EXPORT_SYMBOL(of_chosen); struct device_node *of_aliases; struct device_node *of_stdout; static const char *of_stdout_options; struct kset *of_kset; /* * Used to protect the of_aliases, to hold off addition of nodes to sysfs. * This mutex must be held whenever modifications are being made to the * device tree. The of_{attach,detach}_node() and * of_{add,remove,update}_property() helpers make sure this happens. */ DEFINE_MUTEX(of_mutex); /* use when traversing tree through the child, sibling, * or parent members of struct device_node. */ DEFINE_RAW_SPINLOCK(devtree_lock); bool of_node_name_eq(const struct device_node *np, const char *name) { const char *node_name; size_t len; if (!np) return false; node_name = kbasename(np->full_name); len = strchrnul(node_name, '@') - node_name; return (strlen(name) == len) && (strncmp(node_name, name, len) == 0); } EXPORT_SYMBOL(of_node_name_eq); bool of_node_name_prefix(const struct device_node *np, const char *prefix) { if (!np) return false; return strncmp(kbasename(np->full_name), prefix, strlen(prefix)) == 0; } EXPORT_SYMBOL(of_node_name_prefix); static bool __of_node_is_type(const struct device_node *np, const char *type) { const char *match = __of_get_property(np, "device_type", NULL); return np && match && type && !strcmp(match, type); } #define EXCLUDED_DEFAULT_CELLS_PLATFORMS ( \ IS_ENABLED(CONFIG_SPARC) || \ of_find_compatible_node(NULL, NULL, "coreboot") \ ) int of_bus_n_addr_cells(struct device_node *np) { u32 cells; for (; np; np = np->parent) { if (!of_property_read_u32(np, "#address-cells", &cells)) return cells; /* * Default root value and walking parent nodes for "#address-cells" * is deprecated. Any platforms which hit this warning should * be added to the excluded list. */ WARN_ONCE(!EXCLUDED_DEFAULT_CELLS_PLATFORMS, "Missing '#address-cells' in %pOF\n", np); } return OF_ROOT_NODE_ADDR_CELLS_DEFAULT; } int of_n_addr_cells(struct device_node *np) { if (np->parent) np = np->parent; return of_bus_n_addr_cells(np); } EXPORT_SYMBOL(of_n_addr_cells); int of_bus_n_size_cells(struct device_node *np) { u32 cells; for (; np; np = np->parent) { if (!of_property_read_u32(np, "#size-cells", &cells)) return cells; /* * Default root value and walking parent nodes for "#size-cells" * is deprecated. Any platforms which hit this warning should * be added to the excluded list. */ WARN_ONCE(!EXCLUDED_DEFAULT_CELLS_PLATFORMS, "Missing '#size-cells' in %pOF\n", np); } return OF_ROOT_NODE_SIZE_CELLS_DEFAULT; } int of_n_size_cells(struct device_node *np) { if (np->parent) np = np->parent; return of_bus_n_size_cells(np); } EXPORT_SYMBOL(of_n_size_cells); #ifdef CONFIG_NUMA int __weak of_node_to_nid(struct device_node *np) { return NUMA_NO_NODE; } #endif #define OF_PHANDLE_CACHE_BITS 7 #define OF_PHANDLE_CACHE_SZ BIT(OF_PHANDLE_CACHE_BITS) static struct device_node *phandle_cache[OF_PHANDLE_CACHE_SZ]; static u32 of_phandle_cache_hash(phandle handle) { return hash_32(handle, OF_PHANDLE_CACHE_BITS); } /* * Caller must hold devtree_lock. */ void __of_phandle_cache_inv_entry(phandle handle) { u32 handle_hash; struct device_node *np; if (!handle) return; handle_hash = of_phandle_cache_hash(handle); np = phandle_cache[handle_hash]; if (np && handle == np->phandle) phandle_cache[handle_hash] = NULL; } void __init of_core_init(void) { struct device_node *np; of_platform_register_reconfig_notifier(); /* Create the kset, and register existing nodes */ mutex_lock(&of_mutex); of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj); if (!of_kset) { mutex_unlock(&of_mutex); pr_err("failed to register existing nodes\n"); return; } for_each_of_allnodes(np) { __of_attach_node_sysfs(np); if (np->phandle && !phandle_cache[of_phandle_cache_hash(np->phandle)]) phandle_cache[of_phandle_cache_hash(np->phandle)] = np; } mutex_unlock(&of_mutex); /* Symlink in /proc as required by userspace ABI */ if (of_root) proc_symlink("device-tree", NULL, "/sys/firmware/devicetree/base"); } static struct property *__of_find_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp; if (!np) return NULL; for (pp = np->properties; pp; pp = pp->next) { if (of_prop_cmp(pp->name, name) == 0) { if (lenp) *lenp = pp->length; break; } } return pp; } struct property *of_find_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); pp = __of_find_property(np, name, lenp); raw_spin_unlock_irqrestore(&devtree_lock, flags); return pp; } EXPORT_SYMBOL(of_find_property); struct device_node *__of_find_all_nodes(struct device_node *prev) { struct device_node *np; if (!prev) { np = of_root; } else if (prev->child) { np = prev->child; } else { /* Walk back up looking for a sibling, or the end of the structure */ np = prev; while (np->parent && !np->sibling) np = np->parent; np = np->sibling; /* Might be null at the end of the tree */ } return np; } /** * of_find_all_nodes - Get next node in global list * @prev: Previous node or NULL to start iteration * of_node_put() will be called on it * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_all_nodes(struct device_node *prev) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); np = __of_find_all_nodes(prev); of_node_get(np); of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_all_nodes); /* * Find a property with a given name for a given node * and return the value. */ const void *__of_get_property(const struct device_node *np, const char *name, int *lenp) { const struct property *pp = __of_find_property(np, name, lenp); return pp ? pp->value : NULL; } /* * Find a property with a given name for a given node * and return the value. */ const void *of_get_property(const struct device_node *np, const char *name, int *lenp) { const struct property *pp = of_find_property(np, name, lenp); return pp ? pp->value : NULL; } EXPORT_SYMBOL(of_get_property); /** * __of_device_is_compatible() - Check if the node matches given constraints * @device: pointer to node * @compat: required compatible string, NULL or "" for any match * @type: required device_type value, NULL or "" for any match * @name: required node name, NULL or "" for any match * * Checks if the given @compat, @type and @name strings match the * properties of the given @device. A constraints can be skipped by * passing NULL or an empty string as the constraint. * * Returns 0 for no match, and a positive integer on match. The return * value is a relative score with larger values indicating better * matches. The score is weighted for the most specific compatible value * to get the highest score. Matching type is next, followed by matching * name. Practically speaking, this results in the following priority * order for matches: * * 1. specific compatible && type && name * 2. specific compatible && type * 3. specific compatible && name * 4. specific compatible * 5. general compatible && type && name * 6. general compatible && type * 7. general compatible && name * 8. general compatible * 9. type && name * 10. type * 11. name */ static int __of_device_is_compatible(const struct device_node *device, const char *compat, const char *type, const char *name) { const struct property *prop; const char *cp; int index = 0, score = 0; /* Compatible match has highest priority */ if (compat && compat[0]) { prop = __of_find_property(device, "compatible", NULL); for (cp = of_prop_next_string(prop, NULL); cp; cp = of_prop_next_string(prop, cp), index++) { if (of_compat_cmp(cp, compat, strlen(compat)) == 0) { score = INT_MAX/2 - (index << 2); break; } } if (!score) return 0; } /* Matching type is better than matching name */ if (type && type[0]) { if (!__of_node_is_type(device, type)) return 0; score += 2; } /* Matching name is a bit better than not */ if (name && name[0]) { if (!of_node_name_eq(device, name)) return 0; score++; } return score; } /** Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ int of_device_is_compatible(const struct device_node *device, const char *compat) { unsigned long flags; int res; raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_compatible(device, compat, NULL, NULL); raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_compatible); /** Checks if the device is compatible with any of the entries in * a NULL terminated array of strings. Returns the best match * score or 0. */ int of_device_compatible_match(const struct device_node *device, const char *const *compat) { unsigned int tmp, score = 0; if (!compat) return 0; while (*compat) { tmp = of_device_is_compatible(device, *compat); if (tmp > score) score = tmp; compat++; } return score; } EXPORT_SYMBOL_GPL(of_device_compatible_match); /** * of_machine_compatible_match - Test root of device tree against a compatible array * @compats: NULL terminated array of compatible strings to look for in root node's compatible property. * * Returns true if the root node has any of the given compatible values in its * compatible property. */ bool of_machine_compatible_match(const char *const *compats) { struct device_node *root; int rc = 0; root = of_find_node_by_path("/"); if (root) { rc = of_device_compatible_match(root, compats); of_node_put(root); } return rc != 0; } EXPORT_SYMBOL(of_machine_compatible_match); static bool __of_device_is_status(const struct device_node *device, const char * const*strings) { const char *status; int statlen; if (!device) return false; status = __of_get_property(device, "status", &statlen); if (status == NULL) return false; if (statlen > 0) { while (*strings) { unsigned int len = strlen(*strings); if ((*strings)[len - 1] == '-') { if (!strncmp(status, *strings, len)) return true; } else { if (!strcmp(status, *strings)) return true; } strings++; } } return false; } /** * __of_device_is_available - check if a device is available for use * * @device: Node to check for availability, with locks already held * * Return: True if the status property is absent or set to "okay" or "ok", * false otherwise */ static bool __of_device_is_available(const struct device_node *device) { static const char * const ok[] = {"okay", "ok", NULL}; if (!device) return false; return !__of_get_property(device, "status", NULL) || __of_device_is_status(device, ok); } /** * __of_device_is_reserved - check if a device is reserved * * @device: Node to check for availability, with locks already held * * Return: True if the status property is set to "reserved", false otherwise */ static bool __of_device_is_reserved(const struct device_node *device) { static const char * const reserved[] = {"reserved", NULL}; return __of_device_is_status(device, reserved); } /** * of_device_is_available - check if a device is available for use * * @device: Node to check for availability * * Return: True if the status property is absent or set to "okay" or "ok", * false otherwise */ bool of_device_is_available(const struct device_node *device) { unsigned long flags; bool res; raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_available(device); raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_available); /** * __of_device_is_fail - check if a device has status "fail" or "fail-..." * * @device: Node to check status for, with locks already held * * Return: True if the status property is set to "fail" or "fail-..." (for any * error code suffix), false otherwise */ static bool __of_device_is_fail(const struct device_node *device) { static const char * const fail[] = {"fail", "fail-", NULL}; return __of_device_is_status(device, fail); } /** * of_device_is_big_endian - check if a device has BE registers * * @device: Node to check for endianness * * Return: True if the device has a "big-endian" property, or if the kernel * was compiled for BE *and* the device has a "native-endian" property. * Returns false otherwise. * * Callers would nominally use ioread32be/iowrite32be if * of_device_is_big_endian() == true, or readl/writel otherwise. */ bool of_device_is_big_endian(const struct device_node *device) { if (of_property_read_bool(device, "big-endian")) return true; if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && of_property_read_bool(device, "native-endian")) return true; return false; } EXPORT_SYMBOL(of_device_is_big_endian); /** * of_get_parent - Get a node's parent if any * @node: Node to get parent * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_get_parent(const struct device_node *node) { struct device_node *np; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); np = of_node_get(node->parent); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_get_parent); /** * of_get_next_parent - Iterate to a node's parent * @node: Node to get parent of * * This is like of_get_parent() except that it drops the * refcount on the passed node, making it suitable for iterating * through a node's parents. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_get_next_parent(struct device_node *node) { struct device_node *parent; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); parent = of_node_get(node->parent); of_node_put(node); raw_spin_unlock_irqrestore(&devtree_lock, flags); return parent; } EXPORT_SYMBOL(of_get_next_parent); static struct device_node *__of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; if (!node) return NULL; next = prev ? prev->sibling : node->child; of_node_get(next); of_node_put(prev); return next; } #define __for_each_child_of_node(parent, child) \ for (child = __of_get_next_child(parent, NULL); child != NULL; \ child = __of_get_next_child(parent, child)) /** * of_get_next_child - Iterate a node childs * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * Return: A node pointer with refcount incremented, use of_node_put() on * it when done. Returns NULL when prev is the last child. Decrements the * refcount of prev. */ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); next = __of_get_next_child(node, prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child); /** * of_get_next_child_with_prefix - Find the next child node with prefix * @node: parent node * @prev: previous child of the parent node, or NULL to get first * @prefix: prefix that the node name should have * * This function is like of_get_next_child(), except that it automatically * skips any nodes whose name doesn't have the given prefix. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_get_next_child_with_prefix(const struct device_node *node, struct device_node *prev, const char *prefix) { struct device_node *next; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { if (!of_node_name_prefix(next, prefix)) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child_with_prefix); static struct device_node *of_get_next_status_child(const struct device_node *node, struct device_node *prev, bool (*checker)(const struct device_node *)) { struct device_node *next; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { if (!checker(next)) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } /** * of_get_next_available_child - Find the next available child node * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * This function is like of_get_next_child(), except that it * automatically skips any disabled nodes (i.e. status = "disabled"). */ struct device_node *of_get_next_available_child(const struct device_node *node, struct device_node *prev) { return of_get_next_status_child(node, prev, __of_device_is_available); } EXPORT_SYMBOL(of_get_next_available_child); /** * of_get_next_reserved_child - Find the next reserved child node * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * This function is like of_get_next_child(), except that it * automatically skips any disabled nodes (i.e. status = "disabled"). */ struct device_node *of_get_next_reserved_child(const struct device_node *node, struct device_node *prev) { return of_get_next_status_child(node, prev, __of_device_is_reserved); } EXPORT_SYMBOL(of_get_next_reserved_child); /** * of_get_next_cpu_node - Iterate on cpu nodes * @prev: previous child of the /cpus node, or NULL to get first * * Unusable CPUs (those with the status property set to "fail" or "fail-...") * will be skipped. * * Return: A cpu node pointer with refcount incremented, use of_node_put() * on it when done. Returns NULL when prev is the last child. Decrements * the refcount of prev. */ struct device_node *of_get_next_cpu_node(struct device_node *prev) { struct device_node *next = NULL; unsigned long flags; struct device_node *node; if (!prev) node = of_find_node_by_path("/cpus"); raw_spin_lock_irqsave(&devtree_lock, flags); if (prev) next = prev->sibling; else if (node) { next = node->child; of_node_put(node); } for (; next; next = next->sibling) { if (__of_device_is_fail(next)) continue; if (!(of_node_name_eq(next, "cpu") || __of_node_is_type(next, "cpu"))) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_cpu_node); /** * of_get_compatible_child - Find compatible child node * @parent: parent node * @compatible: compatible string * * Lookup child node whose compatible property contains the given compatible * string. * * Return: a node pointer with refcount incremented, use of_node_put() on it * when done; or NULL if not found. */ struct device_node *of_get_compatible_child(const struct device_node *parent, const char *compatible) { struct device_node *child; for_each_child_of_node(parent, child) { if (of_device_is_compatible(child, compatible)) break; } return child; } EXPORT_SYMBOL(of_get_compatible_child); /** * of_get_child_by_name - Find the child node by name for a given parent * @node: parent node * @name: child name to look for. * * This function looks for child node for given matching name * * Return: A node pointer if found, with refcount incremented, use * of_node_put() on it when done. * Returns NULL if node is not found. */ struct device_node *of_get_child_by_name(const struct device_node *node, const char *name) { struct device_node *child; for_each_child_of_node(node, child) if (of_node_name_eq(child, name)) break; return child; } EXPORT_SYMBOL(of_get_child_by_name); /** * of_get_available_child_by_name - Find the available child node by name for a given parent * @node: parent node * @name: child name to look for. * * This function looks for child node for given matching name and checks the * device's availability for use. * * Return: A node pointer if found, with refcount incremented, use * of_node_put() on it when done. * Returns NULL if node is not found. */ struct device_node *of_get_available_child_by_name(const struct device_node *node, const char *name) { struct device_node *child; child = of_get_child_by_name(node, name); if (child && !of_device_is_available(child)) { of_node_put(child); return NULL; } return child; } EXPORT_SYMBOL(of_get_available_child_by_name); struct device_node *__of_find_node_by_path(const struct device_node *parent, const char *path) { struct device_node *child; int len; len = strcspn(path, "/:"); if (!len) return NULL; __for_each_child_of_node(parent, child) { const char *name = kbasename(child->full_name); if (strncmp(path, name, len) == 0 && (strlen(name) == len)) return child; } return NULL; } struct device_node *__of_find_node_by_full_path(struct device_node *node, const char *path) { const char *separator = strchr(path, ':'); while (node && *path == '/') { struct device_node *tmp = node; path++; /* Increment past '/' delimiter */ node = __of_find_node_by_path(node, path); of_node_put(tmp); path = strchrnul(path, '/'); if (separator && separator < path) break; } return node; } /** * of_find_node_opts_by_path - Find a node matching a full OF path * @path: Either the full path to match, or if the path does not * start with '/', the name of a property of the /aliases * node (an alias). In the case of an alias, the node * matching the alias' value will be returned. * @opts: Address of a pointer into which to store the start of * an options string appended to the end of the path with * a ':' separator. * * Valid paths: * * /foo/bar Full path * * foo Valid alias * * foo/bar Valid alias + relative path * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_opts_by_path(const char *path, const char **opts) { struct device_node *np = NULL; const struct property *pp; unsigned long flags; const char *separator = strchr(path, ':'); if (opts) *opts = separator ? separator + 1 : NULL; if (strcmp(path, "/") == 0) return of_node_get(of_root); /* The path could begin with an alias */ if (*path != '/') { int len; const char *p = strchrnul(path, '/'); if (separator && separator < p) p = separator; len = p - path; /* of_aliases must not be NULL */ if (!of_aliases) return NULL; for_each_property_of_node(of_aliases, pp) { if (strlen(pp->name) == len && !strncmp(pp->name, path, len)) { np = of_find_node_by_path(pp->value); break; } } if (!np) return NULL; path = p; } /* Step down the tree matching path components */ raw_spin_lock_irqsave(&devtree_lock, flags); if (!np) np = of_node_get(of_root); np = __of_find_node_by_full_path(np, path); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_opts_by_path); /** * of_find_node_by_name - Find a node by its "name" property * @from: The node to start searching from or NULL; the node * you pass will not be searched, only the next one * will. Typically, you pass what the previous call * returned. of_node_put() will be called on @from. * @name: The name string to match against * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_name(struct device_node *from, const char *name) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (of_node_name_eq(np, name) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_name); /** * of_find_node_by_type - Find a node by its "device_type" property * @from: The node to start searching from, or NULL to start searching * the entire device tree. The node you pass will not be * searched, only the next one will; typically, you pass * what the previous call returned. of_node_put() will be * called on from for you. * @type: The type string to match against * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_type(struct device_node *from, const char *type) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (__of_node_is_type(np, type) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_type); /** * of_find_compatible_node - Find a node based on type and one of the * tokens in its "compatible" property * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @type: The type string to match "device_type" or NULL to ignore * @compatible: The string to match to one of the tokens in the device * "compatible" list. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compatible) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (__of_device_is_compatible(np, compatible, type, NULL) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_compatible_node); /** * of_find_node_with_property - Find a node which has a property with * the given name. * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @prop_name: The name of the property to look for. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_with_property(struct device_node *from, const char *prop_name) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) { if (__of_find_property(np, prop_name, NULL)) { of_node_get(np); break; } } of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_with_property); static const struct of_device_id *__of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *best_match = NULL; int score, best_score = 0; if (!matches) return NULL; for (; matches->name[0] || matches->type[0] || matches->compatible[0]; matches++) { score = __of_device_is_compatible(node, matches->compatible, matches->type, matches->name); if (score > best_score) { best_match = matches; best_score = score; } } return best_match; } /** * of_match_node - Tell if a device_node has a matching of_match structure * @matches: array of of device match structures to search in * @node: the of device structure to match against * * Low level utility function used by device matching. */ const struct of_device_id *of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *match; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); match = __of_match_node(matches, node); raw_spin_unlock_irqrestore(&devtree_lock, flags); return match; } EXPORT_SYMBOL(of_match_node); /** * of_find_matching_node_and_match - Find a node based on an of_device_id * match table. * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @matches: array of of device match structures to search in * @match: Updated to point at the matches entry which matched * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_matching_node_and_match(struct device_node *from, const struct of_device_id *matches, const struct of_device_id **match) { struct device_node *np; const struct of_device_id *m; unsigned long flags; if (match) *match = NULL; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) { m = __of_match_node(matches, np); if (m && of_node_get(np)) { if (match) *match = m; break; } } of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_matching_node_and_match); /** * of_alias_from_compatible - Lookup appropriate alias for a device node * depending on compatible * @node: pointer to a device tree node * @alias: Pointer to buffer that alias value will be copied into * @len: Length of alias value * * Based on the value of the compatible property, this routine will attempt * to choose an appropriate alias value for a particular device tree node. * It does this by stripping the manufacturer prefix (as delimited by a ',') * from the first entry in the compatible list property. * * Note: The matching on just the "product" side of the compatible is a relic * from I2C and SPI. Please do not add any new user. * * Return: This routine returns 0 on success, <0 on failure. */ int of_alias_from_compatible(const struct device_node *node, char *alias, int len) { const char *compatible, *p; int cplen; compatible = of_get_property(node, "compatible", &cplen); if (!compatible || strlen(compatible) > cplen) return -ENODEV; p = strchr(compatible, ','); strscpy(alias, p ? p + 1 : compatible, len); return 0; } EXPORT_SYMBOL_GPL(of_alias_from_compatible); /** * of_find_node_by_phandle - Find a node given a phandle * @handle: phandle of the node to find * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np = NULL; unsigned long flags; u32 handle_hash; if (!handle) return NULL; handle_hash = of_phandle_cache_hash(handle); raw_spin_lock_irqsave(&devtree_lock, flags); if (phandle_cache[handle_hash] && handle == phandle_cache[handle_hash]->phandle) np = phandle_cache[handle_hash]; if (!np) { for_each_of_allnodes(np) if (np->phandle == handle && !of_node_check_flag(np, OF_DETACHED)) { phandle_cache[handle_hash] = np; break; } } of_node_get(np); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); void of_print_phandle_args(const char *msg, const struct of_phandle_args *args) { int i; printk("%s %pOF", msg, args->np); for (i = 0; i < args->args_count; i++) { const char delim = i ? ',' : ':'; pr_cont("%c%08x", delim, args->args[i]); } pr_cont("\n"); } int of_phandle_iterator_init(struct of_phandle_iterator *it, const struct device_node *np, const char *list_name, const char *cells_name, int cell_count) { const __be32 *list; int size; memset(it, 0, sizeof(*it)); /* * one of cell_count or cells_name must be provided to determine the * argument length. */ if (cell_count < 0 && !cells_name) return -EINVAL; list = of_get_property(np, list_name, &size); if (!list) return -ENOENT; it->cells_name = cells_name; it->cell_count = cell_count; it->parent = np; it->list_end = list + size / sizeof(*list); it->phandle_end = list; it->cur = list; return 0; } EXPORT_SYMBOL_GPL(of_phandle_iterator_init); int of_phandle_iterator_next(struct of_phandle_iterator *it) { uint32_t count = 0; if (it->node) { of_node_put(it->node); it->node = NULL; } if (!it->cur || it->phandle_end >= it->list_end) return -ENOENT; it->cur = it->phandle_end; /* If phandle is 0, then it is an empty entry with no arguments. */ it->phandle = be32_to_cpup(it->cur++); if (it->phandle) { /* * Find the provider node and parse the #*-cells property to * determine the argument length. */ it->node = of_find_node_by_phandle(it->phandle); if (it->cells_name) { if (!it->node) { pr_err("%pOF: could not find phandle %d\n", it->parent, it->phandle); goto err; } if (of_property_read_u32(it->node, it->cells_name, &count)) { /* * If both cell_count and cells_name is given, * fall back to cell_count in absence * of the cells_name property */ if (it->cell_count >= 0) { count = it->cell_count; } else { pr_err("%pOF: could not get %s for %pOF\n", it->parent, it->cells_name, it->node); goto err; } } } else { count = it->cell_count; } /* * Make sure that the arguments actually fit in the remaining * property data length */ if (it->cur + count > it->list_end) { if (it->cells_name) pr_err("%pOF: %s = %d found %td\n", it->parent, it->cells_name, count, it->list_end - it->cur); else pr_err("%pOF: phandle %s needs %d, found %td\n", it->parent, of_node_full_name(it->node), count, it->list_end - it->cur); goto err; } } it->phandle_end = it->cur + count; it->cur_count = count; return 0; err: if (it->node) { of_node_put(it->node); it->node = NULL; } return -EINVAL; } EXPORT_SYMBOL_GPL(of_phandle_iterator_next); int of_phandle_iterator_args(struct of_phandle_iterator *it, uint32_t *args, int size) { int i, count; count = it->cur_count; if (WARN_ON(size < count)) count = size; for (i = 0; i < count; i++) args[i] = be32_to_cpup(it->cur++); return count; } int __of_parse_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name, int cell_count, int index, struct of_phandle_args *out_args) { struct of_phandle_iterator it; int rc, cur_index = 0; if (index < 0) return -EINVAL; /* Loop over the phandles until all the requested entry is found */ of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) { /* * All of the error cases bail out of the loop, so at * this point, the parsing is successful. If the requested * index matches, then fill the out_args structure and return, * or return -ENOENT for an empty entry. */ rc = -ENOENT; if (cur_index == index) { if (!it.phandle) goto err; if (out_args) { int c; c = of_phandle_iterator_args(&it, out_args->args, MAX_PHANDLE_ARGS); out_args->np = it.node; out_args->args_count = c; } else { of_node_put(it.node); } /* Found it! return success */ return 0; } cur_index++; } /* * Unlock node before returning result; will be one of: * -ENOENT : index is for empty phandle * -EINVAL : parsing error on data */ err: of_node_put(it.node); return rc; } EXPORT_SYMBOL(__of_parse_phandle_with_args); /** * of_parse_phandle_with_args_map() - Find a node pointed by phandle in a list and remap it * @np: pointer to a device tree node containing a list * @list_name: property name that contains a list * @stem_name: stem of property names that specify phandles' arguments count * @index: index of a phandle to parse out * @out_args: optional pointer to output arguments structure (will be filled) * * This function is useful to parse lists of phandles and their arguments. * Returns 0 on success and fills out_args, on error returns appropriate errno * value. The difference between this function and of_parse_phandle_with_args() * is that this API remaps a phandle if the node the phandle points to has * a <@stem_name>-map property. * * Caller is responsible to call of_node_put() on the returned out_args->np * pointer. * * Example:: * * phandle1: node1 { * #list-cells = <2>; * }; * * phandle2: node2 { * #list-cells = <1>; * }; * * phandle3: node3 { * #list-cells = <1>; * list-map = <0 &phandle2 3>, * <1 &phandle2 2>, * <2 &phandle1 5 1>; * list-map-mask = <0x3>; * }; * * node4 { * list = <&phandle1 1 2 &phandle3 0>; * }; * * To get a device_node of the ``node2`` node you may call this: * of_parse_phandle_with_args(node4, "list", "list", 1, &args); */ int of_parse_phandle_with_args_map(const struct device_node *np, const char *list_name, const char *stem_name, int index, struct of_phandle_args *out_args) { char *cells_name __free(kfree) = kasprintf(GFP_KERNEL, "#%s-cells", stem_name); char *map_name __free(kfree) = kasprintf(GFP_KERNEL, "%s-map", stem_name); char *mask_name __free(kfree) = kasprintf(GFP_KERNEL, "%s-map-mask", stem_name); char *pass_name __free(kfree) = kasprintf(GFP_KERNEL, "%s-map-pass-thru", stem_name); struct device_node *cur, *new = NULL; const __be32 *map, *mask, *pass; static const __be32 dummy_mask[] = { [0 ... (MAX_PHANDLE_ARGS - 1)] = cpu_to_be32(~0) }; static const __be32 dummy_pass[] = { [0 ... (MAX_PHANDLE_ARGS - 1)] = cpu_to_be32(0) }; __be32 initial_match_array[MAX_PHANDLE_ARGS]; const __be32 *match_array = initial_match_array; int i, ret, map_len, match; u32 list_size, new_size; if (index < 0) return -EINVAL; if (!cells_name || !map_name || !mask_name || !pass_name) return -ENOMEM; ret = __of_parse_phandle_with_args(np, list_name, cells_name, -1, index, out_args); if (ret) return ret; /* Get the #<list>-cells property */ cur = out_args->np; ret = of_property_read_u32(cur, cells_name, &list_size); if (ret < 0) goto put; /* Precalculate the match array - this simplifies match loop */ for (i = 0; i < list_size; i++) initial_match_array[i] = cpu_to_be32(out_args->args[i]); ret = -EINVAL; while (cur) { /* Get the <list>-map property */ map = of_get_property(cur, map_name, &map_len); if (!map) { return 0; } map_len /= sizeof(u32); /* Get the <list>-map-mask property (optional) */ mask = of_get_property(cur, mask_name, NULL); if (!mask) mask = dummy_mask; /* Iterate through <list>-map property */ match = 0; while (map_len > (list_size + 1) && !match) { /* Compare specifiers */ match = 1; for (i = 0; i < list_size; i++, map_len--) match &= !((match_array[i] ^ *map++) & mask[i]); of_node_put(new); new = of_find_node_by_phandle(be32_to_cpup(map)); map++; map_len--; /* Check if not found */ if (!new) { ret = -EINVAL; goto put; } if (!of_device_is_available(new)) match = 0; ret = of_property_read_u32(new, cells_name, &new_size); if (ret) goto put; /* Check for malformed properties */ if (WARN_ON(new_size > MAX_PHANDLE_ARGS) || map_len < new_size) { ret = -EINVAL; goto put; } /* Move forward by new node's #<list>-cells amount */ map += new_size; map_len -= new_size; } if (!match) { ret = -ENOENT; goto put; } /* Get the <list>-map-pass-thru property (optional) */ pass = of_get_property(cur, pass_name, NULL); if (!pass) pass = dummy_pass; /* * Successfully parsed a <list>-map translation; copy new * specifier into the out_args structure, keeping the * bits specified in <list>-map-pass-thru. */ for (i = 0; i < new_size; i++) { __be32 val = *(map - new_size + i); if (i < list_size) { val &= ~pass[i]; val |= cpu_to_be32(out_args->args[i]) & pass[i]; } initial_match_array[i] = val; out_args->args[i] = be32_to_cpu(val); } out_args->args_count = list_size = new_size; /* Iterate again with new provider */ out_args->np = new; of_node_put(cur); cur = new; new = NULL; } put: of_node_put(cur); of_node_put(new); return ret; } EXPORT_SYMBOL(of_parse_phandle_with_args_map); /** * of_count_phandle_with_args() - Find the number of phandles references in a property * @np: pointer to a device tree node containing a list * @list_name: property name that contains a list * @cells_name: property name that specifies phandles' arguments count * * Return: The number of phandle + argument tuples within a property. It * is a typical pattern to encode a list of phandle and variable * arguments into a single property. The number of arguments is encoded * by a property in the phandle-target node. For example, a gpios * property would contain a list of GPIO specifies consisting of a * phandle and 1 or more arguments. The number of arguments are * determined by the #gpio-cells property in the node pointed to by the * phandle. */ int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) { struct of_phandle_iterator it; int rc, cur_index = 0; /* * If cells_name is NULL we assume a cell count of 0. This makes * counting the phandles trivial as each 32bit word in the list is a * phandle and no arguments are to consider. So we don't iterate through * the list but just use the length to determine the phandle count. */ if (!cells_name) { const __be32 *list; int size; list = of_get_property(np, list_name, &size); if (!list) return -ENOENT; return size / sizeof(*list); } rc = of_phandle_iterator_init(&it, np, list_name, cells_name, -1); if (rc) return rc; while ((rc = of_phandle_iterator_next(&it)) == 0) cur_index += 1; if (rc != -ENOENT) return rc; return cur_index; } EXPORT_SYMBOL(of_count_phandle_with_args); static struct property *__of_remove_property_from_list(struct property **list, struct property *prop) { struct property **next; for (next = list; *next; next = &(*next)->next) { if (*next == prop) { *next = prop->next; prop->next = NULL; return prop; } } return NULL; } /** * __of_add_property - Add a property to a node without lock operations * @np: Caller's Device Node * @prop: Property to add */ int __of_add_property(struct device_node *np, struct property *prop) { int rc = 0; unsigned long flags; struct property **next; raw_spin_lock_irqsave(&devtree_lock, flags); __of_remove_property_from_list(&np->deadprops, prop); prop->next = NULL; next = &np->properties; while (*next) { if (of_prop_cmp(prop->name, (*next)->name) == 0) { /* duplicate ! don't insert it */ rc = -EEXIST; goto out_unlock; } next = &(*next)->next; } *next = prop; out_unlock: raw_spin_unlock_irqrestore(&devtree_lock, flags); if (rc) return rc; __of_add_property_sysfs(np, prop); return 0; } /** * of_add_property - Add a property to a node * @np: Caller's Device Node * @prop: Property to add */ int of_add_property(struct device_node *np, struct property *prop) { int rc; mutex_lock(&of_mutex); rc = __of_add_property(np, prop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop, NULL); return rc; } EXPORT_SYMBOL_GPL(of_add_property); int __of_remove_property(struct device_node *np, struct property *prop) { unsigned long flags; int rc = -ENODEV; raw_spin_lock_irqsave(&devtree_lock, flags); if (__of_remove_property_from_list(&np->properties, prop)) { /* Found the property, add it to deadprops list */ prop->next = np->deadprops; np->deadprops = prop; rc = 0; } raw_spin_unlock_irqrestore(&devtree_lock, flags); if (rc) return rc; __of_remove_property_sysfs(np, prop); return 0; } /** * of_remove_property - Remove a property from a node. * @np: Caller's Device Node * @prop: Property to remove * * Note that we don't actually remove it, since we have given out * who-knows-how-many pointers to the data using get-property. * Instead we just move the property to the "dead properties" * list, so it won't be found any more. */ int of_remove_property(struct device_node *np, struct property *prop) { int rc; if (!prop) return -ENODEV; mutex_lock(&of_mutex); rc = __of_remove_property(np, prop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop, NULL); return rc; } EXPORT_SYMBOL_GPL(of_remove_property); int __of_update_property(struct device_node *np, struct property *newprop, struct property **oldpropp) { struct property **next, *oldprop; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); __of_remove_property_from_list(&np->deadprops, newprop); for (next = &np->properties; *next; next = &(*next)->next) { if (of_prop_cmp((*next)->name, newprop->name) == 0) break; } *oldpropp = oldprop = *next; if (oldprop) { /* replace the node */ newprop->next = oldprop->next; *next = newprop; oldprop->next = np->deadprops; np->deadprops = oldprop; } else { /* new node */ newprop->next = NULL; *next = newprop; } raw_spin_unlock_irqrestore(&devtree_lock, flags); __of_update_property_sysfs(np, newprop, oldprop); return 0; } /* * of_update_property - Update a property in a node, if the property does * not exist, add it. * * Note that we don't actually remove it, since we have given out * who-knows-how-many pointers to the data using get-property. * Instead we just move the property to the "dead properties" list, * and add the new property to the property list */ int of_update_property(struct device_node *np, struct property *newprop) { struct property *oldprop; int rc; if (!newprop->name) return -EINVAL; mutex_lock(&of_mutex); rc = __of_update_property(np, newprop, &oldprop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop, oldprop); return rc; } static void of_alias_add(struct alias_prop *ap, struct device_node *np, int id, const char *stem, int stem_len) { ap->np = np; ap->id = id; strscpy(ap->stem, stem, stem_len + 1); list_add_tail(&ap->link, &aliases_lookup); pr_debug("adding DT alias:%s: stem=%s id=%i node=%pOF\n", ap->alias, ap->stem, ap->id, np); } /** * of_alias_scan - Scan all properties of the 'aliases' node * @dt_alloc: An allocator that provides a virtual address to memory * for storing the resulting tree * * The function scans all the properties of the 'aliases' node and populates * the global lookup table with the properties. */ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) { const struct property *pp; of_aliases = of_find_node_by_path("/aliases"); of_chosen = of_find_node_by_path("/chosen"); if (of_chosen == NULL) of_chosen = of_find_node_by_path("/chosen@0"); if (of_chosen) { /* linux,stdout-path and /aliases/stdout are for legacy compatibility */ const char *name = NULL; if (of_property_read_string(of_chosen, "stdout-path", &name)) of_property_read_string(of_chosen, "linux,stdout-path", &name); if (IS_ENABLED(CONFIG_PPC) && !name) of_property_read_string(of_aliases, "stdout", &name); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); if (of_stdout) of_stdout->fwnode.flags |= FWNODE_FLAG_BEST_EFFORT; } if (!of_aliases) return; for_each_property_of_node(of_aliases, pp) { const char *start = pp->name; const char *end = start + strlen(start); struct device_node *np; struct alias_prop *ap; int id, len; /* Skip those we do not want to proceed */ if (is_pseudo_property(pp->name)) continue; np = of_find_node_by_path(pp->value); if (!np) continue; /* walk the alias backwards to extract the id and work out * the 'stem' string */ while (isdigit(*(end-1)) && end > start) end--; len = end - start; if (kstrtoint(end, 10, &id) < 0) continue; /* Allocate an alias_prop with enough space for the stem */ ap = dt_alloc(sizeof(*ap) + len + 1, __alignof__(*ap)); if (!ap) continue; memset(ap, 0, sizeof(*ap) + len + 1); ap->alias = start; of_alias_add(ap, np, id, start, len); } } /** * of_alias_get_id - Get alias id for the given device_node * @np: Pointer to the given device_node * @stem: Alias stem of the given device_node * * The function travels the lookup table to get the alias id for the given * device_node and alias stem. * * Return: The alias id if found. */ int of_alias_get_id(const struct device_node *np, const char *stem) { struct alias_prop *app; int id = -ENODEV; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (strcmp(app->stem, stem) != 0) continue; if (np == app->np) { id = app->id; break; } } mutex_unlock(&of_mutex); return id; } EXPORT_SYMBOL_GPL(of_alias_get_id); /** * of_alias_get_highest_id - Get highest alias id for the given stem * @stem: Alias stem to be examined * * The function travels the lookup table to get the highest alias id for the * given alias stem. It returns the alias id if found. */ int of_alias_get_highest_id(const char *stem) { struct alias_prop *app; int id = -ENODEV; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (strcmp(app->stem, stem) != 0) continue; if (app->id > id) id = app->id; } mutex_unlock(&of_mutex); return id; } EXPORT_SYMBOL_GPL(of_alias_get_highest_id); /** * of_console_check() - Test and setup console for DT setup * @dn: Pointer to device node * @name: Name to use for preferred console without index. ex. "ttyS" * @index: Index to use for preferred console. * * Check if the given device node matches the stdout-path property in the * /chosen node. If it does then register it as the preferred console. * * Return: TRUE if console successfully setup. Otherwise return FALSE. */ bool of_console_check(const struct device_node *dn, char *name, int index) { if (!dn || dn != of_stdout || console_set_on_cmdline) return false; /* * XXX: cast `options' to char pointer to suppress complication * warnings: printk, UART and console drivers expect char pointer. */ return !add_preferred_console(name, index, (char *)of_stdout_options); } EXPORT_SYMBOL_GPL(of_console_check); /** * of_find_next_cache_node - Find a node's subsidiary cache * @np: node of type "cpu" or "cache" * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. Caller should hold a reference * to np. */ struct device_node *of_find_next_cache_node(const struct device_node *np) { struct device_node *child, *cache_node; cache_node = of_parse_phandle(np, "l2-cache", 0); if (!cache_node) cache_node = of_parse_phandle(np, "next-level-cache", 0); if (cache_node) return cache_node; /* OF on pmac has nodes instead of properties named "l2-cache" * beneath CPU nodes. */ if (IS_ENABLED(CONFIG_PPC_PMAC) && of_node_is_type(np, "cpu")) for_each_child_of_node(np, child) if (of_node_is_type(child, "cache")) return child; return NULL; } /** * of_find_last_cache_level - Find the level at which the last cache is * present for the given logical cpu * * @cpu: cpu number(logical index) for which the last cache level is needed * * Return: The level at which the last cache is present. It is exactly * same as the total number of cache levels for the given logical cpu. */ int of_find_last_cache_level(unsigned int cpu) { u32 cache_level = 0; struct device_node *prev = NULL, *np = of_cpu_device_node_get(cpu); while (np) { of_node_put(prev); prev = np; np = of_find_next_cache_node(np); } of_property_read_u32(prev, "cache-level", &cache_level); of_node_put(prev); return cache_level; } /** * of_map_id - Translate an ID through a downstream mapping. * @np: root complex device node. * @id: device ID to map. * @map_name: property name of the map to use. * @map_mask_name: optional property name of the mask to use. * @target: optional pointer to a target device node. * @id_out: optional pointer to receive the translated ID. * * Given a device ID, look up the appropriate implementation-defined * platform ID and/or the target device which receives transactions on that * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or * @id_out may be NULL if only the other is required. If @target points to * a non-NULL device node pointer, only entries targeting that node will be * matched; if it points to a NULL value, it will receive the device node of * the first matching target phandle, with a reference held. * * Return: 0 on success or a standard error code on failure. */ int of_map_id(const struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { u32 map_mask, masked_id; int map_len; const __be32 *map = NULL; if (!np || !map_name || (!target && !id_out)) return -EINVAL; map = of_get_property(np, map_name, &map_len); if (!map) { if (target) return -ENODEV; /* Otherwise, no map implies no translation */ *id_out = id; return 0; } if (!map_len || map_len % (4 * sizeof(*map))) { pr_err("%pOF: Error: Bad %s length: %d\n", np, map_name, map_len); return -EINVAL; } /* The default is to select all bits. */ map_mask = 0xffffffff; /* * Can be overridden by "{iommu,msi}-map-mask" property. * If of_property_read_u32() fails, the default is used. */ if (map_mask_name) of_property_read_u32(np, map_mask_name, &map_mask); masked_id = map_mask & id; for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) { struct device_node *phandle_node; u32 id_base = be32_to_cpup(map + 0); u32 phandle = be32_to_cpup(map + 1); u32 out_base = be32_to_cpup(map + 2); u32 id_len = be32_to_cpup(map + 3); if (id_base & ~map_mask) { pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores id-base (0x%x)\n", np, map_name, map_name, map_mask, id_base); return -EFAULT; } if (masked_id < id_base || masked_id >= id_base + id_len) continue; phandle_node = of_find_node_by_phandle(phandle); if (!phandle_node) return -ENODEV; if (target) { if (*target) of_node_put(phandle_node); else *target = phandle_node; if (*target != phandle_node) continue; } if (id_out) *id_out = masked_id - id_base + out_base; pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n", np, map_name, map_mask, id_base, out_base, id_len, id, masked_id - id_base + out_base); return 0; } pr_info("%pOF: no %s translation for id 0x%x on %pOF\n", np, map_name, id, target && *target ? *target : NULL); /* Bypasses translation */ if (id_out) *id_out = id; return 0; } EXPORT_SYMBOL_GPL(of_map_id);
597 598 597 597 1994 1969 596 598 598 1995 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/drivers/net/netconsole.c * * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> * * This file contains the implementation of an IRQ-safe, crash-safe * kernel console implementation that outputs kernel messages to the * network. * * Modification history: * * 2001-09-17 started by Ingo Molnar. * 2003-08-11 2.6 port by Matt Mackall * simplified options * generic card hooks * works non-modular * 2003-09-07 rewritten with netpoll api */ /**************************************************************** * ****************************************************************/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/console.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/netpoll.h> #include <linux/inet.h> #include <linux/configfs.h> #include <linux/etherdevice.h> #include <linux/u64_stats_sync.h> #include <linux/utsname.h> #include <linux/rtnetlink.h> MODULE_AUTHOR("Matt Mackall <mpm@selenic.com>"); MODULE_DESCRIPTION("Console driver for network interfaces"); MODULE_LICENSE("GPL"); #define MAX_PARAM_LENGTH 256 #define MAX_EXTRADATA_ENTRY_LEN 256 #define MAX_EXTRADATA_VALUE_LEN 200 /* The number 3 comes from userdata entry format characters (' ', '=', '\n') */ #define MAX_EXTRADATA_NAME_LEN (MAX_EXTRADATA_ENTRY_LEN - \ MAX_EXTRADATA_VALUE_LEN - 3) #define MAX_EXTRADATA_ITEMS 16 #define MAX_PRINT_CHUNK 1000 static char config[MAX_PARAM_LENGTH]; module_param_string(netconsole, config, MAX_PARAM_LENGTH, 0); MODULE_PARM_DESC(netconsole, " netconsole=[src-port]@[src-ip]/[dev],[tgt-port]@<tgt-ip>/[tgt-macaddr]"); static bool oops_only; module_param(oops_only, bool, 0600); MODULE_PARM_DESC(oops_only, "Only log oops messages"); #define NETCONSOLE_PARAM_TARGET_PREFIX "cmdline" #ifndef MODULE static int __init option_setup(char *opt) { strscpy(config, opt, MAX_PARAM_LENGTH); return 1; } __setup("netconsole=", option_setup); #endif /* MODULE */ /* Linked list of all configured targets */ static LIST_HEAD(target_list); /* target_cleanup_list is used to track targets that need to be cleaned outside * of target_list_lock. It should be cleaned in the same function it is * populated. */ static LIST_HEAD(target_cleanup_list); /* This needs to be a spinlock because write_msg() cannot sleep */ static DEFINE_SPINLOCK(target_list_lock); /* This needs to be a mutex because netpoll_cleanup might sleep */ static DEFINE_MUTEX(target_cleanup_list_lock); /* * Console driver for netconsoles. Register only consoles that have * an associated target of the same type. */ static struct console netconsole_ext, netconsole; struct netconsole_target_stats { u64_stats_t xmit_drop_count; u64_stats_t enomem_count; struct u64_stats_sync syncp; }; enum console_type { CONS_BASIC = BIT(0), CONS_EXTENDED = BIT(1), }; /* Features enabled in sysdata. Contrary to userdata, this data is populated by * the kernel. The fields are designed as bitwise flags, allowing multiple * features to be set in sysdata_fields. */ enum sysdata_feature { /* Populate the CPU that sends the message */ SYSDATA_CPU_NR = BIT(0), /* Populate the task name (as in current->comm) in sysdata */ SYSDATA_TASKNAME = BIT(1), /* Kernel release/version as part of sysdata */ SYSDATA_RELEASE = BIT(2), /* Include a per-target message ID as part of sysdata */ SYSDATA_MSGID = BIT(3), }; /** * struct netconsole_target - Represents a configured netconsole target. * @list: Links this target into the target_list. * @group: Links us into the configfs subsystem hierarchy. * @userdata_group: Links to the userdata configfs hierarchy * @extradata_complete: Cached, formatted string of append * @userdata_length: String length of usedata in extradata_complete. * @sysdata_fields: Sysdata features enabled. * @msgcounter: Message sent counter. * @stats: Packet send stats for the target. Used for debugging. * @enabled: On / off knob to enable / disable target. * Visible from userspace (read-write). * We maintain a strict 1:1 correspondence between this and * whether the corresponding netpoll is active or inactive. * Also, other parameters of a target may be modified at * runtime only when it is disabled (enabled == 0). * @extended: Denotes whether console is extended or not. * @release: Denotes whether kernel release version should be prepended * to the message. Depends on extended console. * @np: The netpoll structure for this target. * Contains the other userspace visible parameters: * dev_name (read-write) * local_port (read-write) * remote_port (read-write) * local_ip (read-write) * remote_ip (read-write) * local_mac (read-only) * remote_mac (read-write) * @buf: The buffer used to send the full msg to the network stack */ struct netconsole_target { struct list_head list; #ifdef CONFIG_NETCONSOLE_DYNAMIC struct config_group group; struct config_group userdata_group; char extradata_complete[MAX_EXTRADATA_ENTRY_LEN * MAX_EXTRADATA_ITEMS]; size_t userdata_length; /* bit-wise with sysdata_feature bits */ u32 sysdata_fields; /* protected by target_list_lock */ u32 msgcounter; #endif struct netconsole_target_stats stats; bool enabled; bool extended; bool release; struct netpoll np; /* protected by target_list_lock */ char buf[MAX_PRINT_CHUNK]; }; #ifdef CONFIG_NETCONSOLE_DYNAMIC static struct configfs_subsystem netconsole_subsys; static DEFINE_MUTEX(dynamic_netconsole_mutex); static int __init dynamic_netconsole_init(void) { config_group_init(&netconsole_subsys.su_group); mutex_init(&netconsole_subsys.su_mutex); return configfs_register_subsystem(&netconsole_subsys); } static void __exit dynamic_netconsole_exit(void) { configfs_unregister_subsystem(&netconsole_subsys); } /* * Targets that were created by parsing the boot/module option string * do not exist in the configfs hierarchy (and have NULL names) and will * never go away, so make these a no-op for them. */ static void netconsole_target_get(struct netconsole_target *nt) { if (config_item_name(&nt->group.cg_item)) config_group_get(&nt->group); } static void netconsole_target_put(struct netconsole_target *nt) { if (config_item_name(&nt->group.cg_item)) config_group_put(&nt->group); } #else /* !CONFIG_NETCONSOLE_DYNAMIC */ static int __init dynamic_netconsole_init(void) { return 0; } static void __exit dynamic_netconsole_exit(void) { } /* * No danger of targets going away from under us when dynamic * reconfigurability is off. */ static void netconsole_target_get(struct netconsole_target *nt) { } static void netconsole_target_put(struct netconsole_target *nt) { } static void populate_configfs_item(struct netconsole_target *nt, int cmdline_count) { } #endif /* CONFIG_NETCONSOLE_DYNAMIC */ /* Allocate and initialize with defaults. * Note that these targets get their config_item fields zeroed-out. */ static struct netconsole_target *alloc_and_init(void) { struct netconsole_target *nt; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return nt; if (IS_ENABLED(CONFIG_NETCONSOLE_EXTENDED_LOG)) nt->extended = true; if (IS_ENABLED(CONFIG_NETCONSOLE_PREPEND_RELEASE)) nt->release = true; nt->np.name = "netconsole"; strscpy(nt->np.dev_name, "eth0", IFNAMSIZ); nt->np.local_port = 6665; nt->np.remote_port = 6666; eth_broadcast_addr(nt->np.remote_mac); return nt; } /* Clean up every target in the cleanup_list and move the clean targets back to * the main target_list. */ static void netconsole_process_cleanups_core(void) { struct netconsole_target *nt, *tmp; unsigned long flags; /* The cleanup needs RTNL locked */ ASSERT_RTNL(); mutex_lock(&target_cleanup_list_lock); list_for_each_entry_safe(nt, tmp, &target_cleanup_list, list) { /* all entries in the cleanup_list needs to be disabled */ WARN_ON_ONCE(nt->enabled); do_netpoll_cleanup(&nt->np); /* moved the cleaned target to target_list. Need to hold both * locks */ spin_lock_irqsave(&target_list_lock, flags); list_move(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); } WARN_ON_ONCE(!list_empty(&target_cleanup_list)); mutex_unlock(&target_cleanup_list_lock); } static void netconsole_print_banner(struct netpoll *np) { np_info(np, "local port %d\n", np->local_port); if (np->ipv6) np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); else np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); np_info(np, "interface name '%s'\n", np->dev_name); np_info(np, "local ethernet address '%pM'\n", np->dev_mac); np_info(np, "remote port %d\n", np->remote_port); if (np->ipv6) np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); else np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); np_info(np, "remote ethernet address %pM\n", np->remote_mac); } /* Parse the string and populate the `inet_addr` union. Return 0 if IPv4 is * populated, 1 if IPv6 is populated, and -1 upon failure. */ static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) { const char *end = NULL; int len; len = strlen(str); if (!len) return -1; if (str[len - 1] == '\n') len -= 1; if (in4_pton(str, len, (void *)addr, -1, &end) > 0 && (!end || *end == 0 || *end == '\n')) return 0; if (IS_ENABLED(CONFIG_IPV6) && in6_pton(str, len, (void *)addr, -1, &end) > 0 && (!end || *end == 0 || *end == '\n')) return 1; return -1; } #ifdef CONFIG_NETCONSOLE_DYNAMIC /* * Our subsystem hierarchy is: * * /sys/kernel/config/netconsole/ * | * <target>/ * | enabled * | release * | dev_name * | local_port * | remote_port * | local_ip * | remote_ip * | local_mac * | remote_mac * | transmit_errors * | userdata/ * | <key>/ * | value * | ... * | * <target>/... */ static struct netconsole_target *to_target(struct config_item *item) { struct config_group *cfg_group; cfg_group = to_config_group(item); if (!cfg_group) return NULL; return container_of(to_config_group(item), struct netconsole_target, group); } /* Do the list cleanup with the rtnl lock hold. rtnl lock is necessary because * netdev might be cleaned-up by calling __netpoll_cleanup(), */ static void netconsole_process_cleanups(void) { /* rtnl lock is called here, because it has precedence over * target_cleanup_list_lock mutex and target_cleanup_list */ rtnl_lock(); netconsole_process_cleanups_core(); rtnl_unlock(); } /* Get rid of possible trailing newline, returning the new length */ static void trim_newline(char *s, size_t maxlen) { size_t len; len = strnlen(s, maxlen); if (s[len - 1] == '\n') s[len - 1] = '\0'; } /* * Attribute operations for netconsole_target. */ static ssize_t enabled_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->enabled); } static ssize_t extended_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->extended); } static ssize_t release_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->release); } static ssize_t dev_name_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%s\n", to_target(item)->np.dev_name); } static ssize_t local_port_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->np.local_port); } static ssize_t remote_port_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->np.remote_port); } static ssize_t local_ip_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); if (nt->np.ipv6) return sysfs_emit(buf, "%pI6c\n", &nt->np.local_ip.in6); else return sysfs_emit(buf, "%pI4\n", &nt->np.local_ip); } static ssize_t remote_ip_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); if (nt->np.ipv6) return sysfs_emit(buf, "%pI6c\n", &nt->np.remote_ip.in6); else return sysfs_emit(buf, "%pI4\n", &nt->np.remote_ip); } static ssize_t local_mac_show(struct config_item *item, char *buf) { struct net_device *dev = to_target(item)->np.dev; static const u8 bcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; return sysfs_emit(buf, "%pM\n", dev ? dev->dev_addr : bcast); } static ssize_t remote_mac_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%pM\n", to_target(item)->np.remote_mac); } static ssize_t transmit_errors_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); u64 xmit_drop_count, enomem_count; unsigned int start; do { start = u64_stats_fetch_begin(&nt->stats.syncp); xmit_drop_count = u64_stats_read(&nt->stats.xmit_drop_count); enomem_count = u64_stats_read(&nt->stats.enomem_count); } while (u64_stats_fetch_retry(&nt->stats.syncp, start)); return sysfs_emit(buf, "%llu\n", xmit_drop_count + enomem_count); } /* configfs helper to display if cpu_nr sysdata feature is enabled */ static ssize_t sysdata_cpu_nr_enabled_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item->ci_parent); bool cpu_nr_enabled; mutex_lock(&dynamic_netconsole_mutex); cpu_nr_enabled = !!(nt->sysdata_fields & SYSDATA_CPU_NR); mutex_unlock(&dynamic_netconsole_mutex); return sysfs_emit(buf, "%d\n", cpu_nr_enabled); } /* configfs helper to display if taskname sysdata feature is enabled */ static ssize_t sysdata_taskname_enabled_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item->ci_parent); bool taskname_enabled; mutex_lock(&dynamic_netconsole_mutex); taskname_enabled = !!(nt->sysdata_fields & SYSDATA_TASKNAME); mutex_unlock(&dynamic_netconsole_mutex); return sysfs_emit(buf, "%d\n", taskname_enabled); } static ssize_t sysdata_release_enabled_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item->ci_parent); bool release_enabled; mutex_lock(&dynamic_netconsole_mutex); release_enabled = !!(nt->sysdata_fields & SYSDATA_TASKNAME); mutex_unlock(&dynamic_netconsole_mutex); return sysfs_emit(buf, "%d\n", release_enabled); } /* Iterate in the list of target, and make sure we don't have any console * register without targets of the same type */ static void unregister_netcons_consoles(void) { struct netconsole_target *nt; u32 console_type_needed = 0; unsigned long flags; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) { if (nt->extended) console_type_needed |= CONS_EXTENDED; else console_type_needed |= CONS_BASIC; } spin_unlock_irqrestore(&target_list_lock, flags); if (!(console_type_needed & CONS_EXTENDED) && console_is_registered(&netconsole_ext)) unregister_console(&netconsole_ext); if (!(console_type_needed & CONS_BASIC) && console_is_registered(&netconsole)) unregister_console(&netconsole); } static ssize_t sysdata_msgid_enabled_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item->ci_parent); bool msgid_enabled; mutex_lock(&dynamic_netconsole_mutex); msgid_enabled = !!(nt->sysdata_fields & SYSDATA_MSGID); mutex_unlock(&dynamic_netconsole_mutex); return sysfs_emit(buf, "%d\n", msgid_enabled); } /* * This one is special -- targets created through the configfs interface * are not enabled (and the corresponding netpoll activated) by default. * The user is expected to set the desired parameters first (which * would enable him to dynamically add new netpoll targets for new * network interfaces as and when they come up). */ static ssize_t enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); unsigned long flags; bool enabled; ssize_t ret; mutex_lock(&dynamic_netconsole_mutex); ret = kstrtobool(buf, &enabled); if (ret) goto out_unlock; ret = -EINVAL; if (enabled == nt->enabled) { pr_info("network logging has already %s\n", nt->enabled ? "started" : "stopped"); goto out_unlock; } if (enabled) { /* true */ if (nt->release && !nt->extended) { pr_err("Not enabling netconsole. Release feature requires extended log message"); goto out_unlock; } if (nt->extended && !console_is_registered(&netconsole_ext)) { netconsole_ext.flags |= CON_ENABLED; register_console(&netconsole_ext); } /* User might be enabling the basic format target for the very * first time, make sure the console is registered. */ if (!nt->extended && !console_is_registered(&netconsole)) { netconsole.flags |= CON_ENABLED; register_console(&netconsole); } /* * Skip netconsole_parser_cmdline() -- all the attributes are * already configured via configfs. Just print them out. */ netconsole_print_banner(&nt->np); ret = netpoll_setup(&nt->np); if (ret) goto out_unlock; nt->enabled = true; pr_info("network logging started\n"); } else { /* false */ /* We need to disable the netconsole before cleaning it up * otherwise we might end up in write_msg() with * nt->np.dev == NULL and nt->enabled == true */ mutex_lock(&target_cleanup_list_lock); spin_lock_irqsave(&target_list_lock, flags); nt->enabled = false; /* Remove the target from the list, while holding * target_list_lock */ list_move(&nt->list, &target_cleanup_list); spin_unlock_irqrestore(&target_list_lock, flags); mutex_unlock(&target_cleanup_list_lock); /* Unregister consoles, whose the last target of that type got * disabled. */ unregister_netcons_consoles(); } ret = strnlen(buf, count); /* Deferred cleanup */ netconsole_process_cleanups(); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t release_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); bool release; ssize_t ret; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); ret = -EINVAL; goto out_unlock; } ret = kstrtobool(buf, &release); if (ret) goto out_unlock; nt->release = release; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t extended_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); bool extended; ssize_t ret; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); ret = -EINVAL; goto out_unlock; } ret = kstrtobool(buf, &extended); if (ret) goto out_unlock; nt->extended = extended; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t dev_name_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); mutex_unlock(&dynamic_netconsole_mutex); return -EINVAL; } strscpy(nt->np.dev_name, buf, IFNAMSIZ); trim_newline(nt->np.dev_name, IFNAMSIZ); mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); } static ssize_t local_port_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } ret = kstrtou16(buf, 10, &nt->np.local_port); if (ret < 0) goto out_unlock; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t remote_port_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } ret = kstrtou16(buf, 10, &nt->np.remote_port); if (ret < 0) goto out_unlock; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t local_ip_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; int ipv6; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } ipv6 = netpoll_parse_ip_addr(buf, &nt->np.local_ip); if (ipv6 == -1) goto out_unlock; nt->np.ipv6 = !!ipv6; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t remote_ip_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; int ipv6; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } ipv6 = netpoll_parse_ip_addr(buf, &nt->np.remote_ip); if (ipv6 == -1) goto out_unlock; nt->np.ipv6 = !!ipv6; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } /* Count number of entries we have in extradata. * This is important because the extradata_complete only supports * MAX_EXTRADATA_ITEMS entries. Before enabling any new {user,sys}data * feature, number of entries needs to checked for available space. */ static size_t count_extradata_entries(struct netconsole_target *nt) { size_t entries; /* Userdata entries */ entries = list_count_nodes(&nt->userdata_group.cg_children); /* Plus sysdata entries */ if (nt->sysdata_fields & SYSDATA_CPU_NR) entries += 1; if (nt->sysdata_fields & SYSDATA_TASKNAME) entries += 1; if (nt->sysdata_fields & SYSDATA_RELEASE) entries += 1; if (nt->sysdata_fields & SYSDATA_MSGID) entries += 1; return entries; } static ssize_t remote_mac_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); u8 remote_mac[ETH_ALEN]; ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } if (!mac_pton(buf, remote_mac)) goto out_unlock; if (buf[MAC_ADDR_STR_LEN] && buf[MAC_ADDR_STR_LEN] != '\n') goto out_unlock; memcpy(nt->np.remote_mac, remote_mac, ETH_ALEN); ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } struct userdatum { struct config_item item; char value[MAX_EXTRADATA_VALUE_LEN]; }; static struct userdatum *to_userdatum(struct config_item *item) { return container_of(item, struct userdatum, item); } struct userdata { struct config_group group; }; static struct userdata *to_userdata(struct config_item *item) { return container_of(to_config_group(item), struct userdata, group); } static struct netconsole_target *userdata_to_target(struct userdata *ud) { struct config_group *netconsole_group; netconsole_group = to_config_group(ud->group.cg_item.ci_parent); return to_target(&netconsole_group->cg_item); } static ssize_t userdatum_value_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%s\n", &(to_userdatum(item)->value[0])); } static void update_userdata(struct netconsole_target *nt) { int complete_idx = 0, child_count = 0; struct list_head *entry; /* Clear the current string in case the last userdatum was deleted */ nt->userdata_length = 0; nt->extradata_complete[0] = 0; list_for_each(entry, &nt->userdata_group.cg_children) { struct userdatum *udm_item; struct config_item *item; if (WARN_ON_ONCE(child_count >= MAX_EXTRADATA_ITEMS)) break; child_count++; item = container_of(entry, struct config_item, ci_entry); udm_item = to_userdatum(item); /* Skip userdata with no value set */ if (strnlen(udm_item->value, MAX_EXTRADATA_VALUE_LEN) == 0) continue; /* This doesn't overflow extradata_complete since it will write * one entry length (1/MAX_EXTRADATA_ITEMS long), entry count is * checked to not exceed MAX items with child_count above */ complete_idx += scnprintf(&nt->extradata_complete[complete_idx], MAX_EXTRADATA_ENTRY_LEN, " %s=%s\n", item->ci_name, udm_item->value); } nt->userdata_length = strnlen(nt->extradata_complete, sizeof(nt->extradata_complete)); } static ssize_t userdatum_value_store(struct config_item *item, const char *buf, size_t count) { struct userdatum *udm = to_userdatum(item); struct netconsole_target *nt; struct userdata *ud; ssize_t ret; if (count > MAX_EXTRADATA_VALUE_LEN) return -EMSGSIZE; mutex_lock(&dynamic_netconsole_mutex); ret = strscpy(udm->value, buf, sizeof(udm->value)); if (ret < 0) goto out_unlock; trim_newline(udm->value, sizeof(udm->value)); ud = to_userdata(item->ci_parent); nt = userdata_to_target(ud); update_userdata(nt); ret = count; out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } /* disable_sysdata_feature - Disable sysdata feature and clean sysdata * @nt: target that is disabling the feature * @feature: feature being disabled */ static void disable_sysdata_feature(struct netconsole_target *nt, enum sysdata_feature feature) { nt->sysdata_fields &= ~feature; nt->extradata_complete[nt->userdata_length] = 0; } static ssize_t sysdata_msgid_enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item->ci_parent); bool msgid_enabled, curr; ssize_t ret; ret = kstrtobool(buf, &msgid_enabled); if (ret) return ret; mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_MSGID); if (msgid_enabled == curr) goto unlock_ok; if (msgid_enabled && count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) { ret = -ENOSPC; goto unlock; } if (msgid_enabled) nt->sysdata_fields |= SYSDATA_MSGID; else disable_sysdata_feature(nt, SYSDATA_MSGID); unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t sysdata_release_enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item->ci_parent); bool release_enabled, curr; ssize_t ret; ret = kstrtobool(buf, &release_enabled); if (ret) return ret; mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_RELEASE); if (release_enabled == curr) goto unlock_ok; if (release_enabled && count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) { ret = -ENOSPC; goto unlock; } if (release_enabled) nt->sysdata_fields |= SYSDATA_RELEASE; else disable_sysdata_feature(nt, SYSDATA_RELEASE); unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t sysdata_taskname_enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item->ci_parent); bool taskname_enabled, curr; ssize_t ret; ret = kstrtobool(buf, &taskname_enabled); if (ret) return ret; mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_TASKNAME); if (taskname_enabled == curr) goto unlock_ok; if (taskname_enabled && count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) { ret = -ENOSPC; goto unlock; } if (taskname_enabled) nt->sysdata_fields |= SYSDATA_TASKNAME; else disable_sysdata_feature(nt, SYSDATA_TASKNAME); unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } /* configfs helper to sysdata cpu_nr feature */ static ssize_t sysdata_cpu_nr_enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item->ci_parent); bool cpu_nr_enabled, curr; ssize_t ret; ret = kstrtobool(buf, &cpu_nr_enabled); if (ret) return ret; mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_CPU_NR); if (cpu_nr_enabled == curr) /* no change requested */ goto unlock_ok; if (cpu_nr_enabled && count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) { /* user wants the new feature, but there is no space in the * buffer. */ ret = -ENOSPC; goto unlock; } if (cpu_nr_enabled) nt->sysdata_fields |= SYSDATA_CPU_NR; else /* This is special because extradata_complete might have * remaining data from previous sysdata, and it needs to be * cleaned. */ disable_sysdata_feature(nt, SYSDATA_CPU_NR); unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } CONFIGFS_ATTR(userdatum_, value); CONFIGFS_ATTR(sysdata_, cpu_nr_enabled); CONFIGFS_ATTR(sysdata_, taskname_enabled); CONFIGFS_ATTR(sysdata_, release_enabled); CONFIGFS_ATTR(sysdata_, msgid_enabled); static struct configfs_attribute *userdatum_attrs[] = { &userdatum_attr_value, NULL, }; static void userdatum_release(struct config_item *item) { kfree(to_userdatum(item)); } static struct configfs_item_operations userdatum_ops = { .release = userdatum_release, }; static const struct config_item_type userdatum_type = { .ct_item_ops = &userdatum_ops, .ct_attrs = userdatum_attrs, .ct_owner = THIS_MODULE, }; static struct config_item *userdatum_make_item(struct config_group *group, const char *name) { struct netconsole_target *nt; struct userdatum *udm; struct userdata *ud; if (strlen(name) > MAX_EXTRADATA_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); ud = to_userdata(&group->cg_item); nt = userdata_to_target(ud); if (count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) return ERR_PTR(-ENOSPC); udm = kzalloc(sizeof(*udm), GFP_KERNEL); if (!udm) return ERR_PTR(-ENOMEM); config_item_init_type_name(&udm->item, name, &userdatum_type); return &udm->item; } static void userdatum_drop(struct config_group *group, struct config_item *item) { struct netconsole_target *nt; struct userdata *ud; ud = to_userdata(&group->cg_item); nt = userdata_to_target(ud); mutex_lock(&dynamic_netconsole_mutex); update_userdata(nt); config_item_put(item); mutex_unlock(&dynamic_netconsole_mutex); } static struct configfs_attribute *userdata_attrs[] = { &sysdata_attr_cpu_nr_enabled, &sysdata_attr_taskname_enabled, &sysdata_attr_release_enabled, &sysdata_attr_msgid_enabled, NULL, }; static struct configfs_group_operations userdata_ops = { .make_item = userdatum_make_item, .drop_item = userdatum_drop, }; static const struct config_item_type userdata_type = { .ct_item_ops = &userdatum_ops, .ct_group_ops = &userdata_ops, .ct_attrs = userdata_attrs, .ct_owner = THIS_MODULE, }; CONFIGFS_ATTR(, enabled); CONFIGFS_ATTR(, extended); CONFIGFS_ATTR(, dev_name); CONFIGFS_ATTR(, local_port); CONFIGFS_ATTR(, remote_port); CONFIGFS_ATTR(, local_ip); CONFIGFS_ATTR(, remote_ip); CONFIGFS_ATTR_RO(, local_mac); CONFIGFS_ATTR(, remote_mac); CONFIGFS_ATTR(, release); CONFIGFS_ATTR_RO(, transmit_errors); static struct configfs_attribute *netconsole_target_attrs[] = { &attr_enabled, &attr_extended, &attr_release, &attr_dev_name, &attr_local_port, &attr_remote_port, &attr_local_ip, &attr_remote_ip, &attr_local_mac, &attr_remote_mac, &attr_transmit_errors, NULL, }; /* * Item operations and type for netconsole_target. */ static void netconsole_target_release(struct config_item *item) { kfree(to_target(item)); } static struct configfs_item_operations netconsole_target_item_ops = { .release = netconsole_target_release, }; static const struct config_item_type netconsole_target_type = { .ct_attrs = netconsole_target_attrs, .ct_item_ops = &netconsole_target_item_ops, .ct_owner = THIS_MODULE, }; static void init_target_config_group(struct netconsole_target *nt, const char *name) { config_group_init_type_name(&nt->group, name, &netconsole_target_type); config_group_init_type_name(&nt->userdata_group, "userdata", &userdata_type); configfs_add_default_group(&nt->userdata_group, &nt->group); } static struct netconsole_target *find_cmdline_target(const char *name) { struct netconsole_target *nt, *ret = NULL; unsigned long flags; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) { if (!strcmp(nt->group.cg_item.ci_name, name)) { ret = nt; break; } } spin_unlock_irqrestore(&target_list_lock, flags); return ret; } /* * Group operations and type for netconsole_subsys. */ static struct config_group *make_netconsole_target(struct config_group *group, const char *name) { struct netconsole_target *nt; unsigned long flags; /* Checking if a target by this name was created at boot time. If so, * attach a configfs entry to that target. This enables dynamic * control. */ if (!strncmp(name, NETCONSOLE_PARAM_TARGET_PREFIX, strlen(NETCONSOLE_PARAM_TARGET_PREFIX))) { nt = find_cmdline_target(name); if (nt) { init_target_config_group(nt, name); return &nt->group; } } nt = alloc_and_init(); if (!nt) return ERR_PTR(-ENOMEM); /* Initialize the config_group member */ init_target_config_group(nt, name); /* Adding, but it is disabled */ spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); return &nt->group; } static void drop_netconsole_target(struct config_group *group, struct config_item *item) { unsigned long flags; struct netconsole_target *nt = to_target(item); spin_lock_irqsave(&target_list_lock, flags); list_del(&nt->list); spin_unlock_irqrestore(&target_list_lock, flags); /* * The target may have never been enabled, or was manually disabled * before being removed so netpoll may have already been cleaned up. */ if (nt->enabled) netpoll_cleanup(&nt->np); config_item_put(&nt->group.cg_item); } static struct configfs_group_operations netconsole_subsys_group_ops = { .make_group = make_netconsole_target, .drop_item = drop_netconsole_target, }; static const struct config_item_type netconsole_subsys_type = { .ct_group_ops = &netconsole_subsys_group_ops, .ct_owner = THIS_MODULE, }; /* The netconsole configfs subsystem */ static struct configfs_subsystem netconsole_subsys = { .su_group = { .cg_item = { .ci_namebuf = "netconsole", .ci_type = &netconsole_subsys_type, }, }, }; static void populate_configfs_item(struct netconsole_target *nt, int cmdline_count) { char target_name[16]; snprintf(target_name, sizeof(target_name), "%s%d", NETCONSOLE_PARAM_TARGET_PREFIX, cmdline_count); init_target_config_group(nt, target_name); } static int sysdata_append_cpu_nr(struct netconsole_target *nt, int offset) { /* Append cpu=%d at extradata_complete after userdata str */ return scnprintf(&nt->extradata_complete[offset], MAX_EXTRADATA_ENTRY_LEN, " cpu=%u\n", raw_smp_processor_id()); } static int sysdata_append_taskname(struct netconsole_target *nt, int offset) { return scnprintf(&nt->extradata_complete[offset], MAX_EXTRADATA_ENTRY_LEN, " taskname=%s\n", current->comm); } static int sysdata_append_release(struct netconsole_target *nt, int offset) { return scnprintf(&nt->extradata_complete[offset], MAX_EXTRADATA_ENTRY_LEN, " release=%s\n", init_utsname()->release); } static int sysdata_append_msgid(struct netconsole_target *nt, int offset) { wrapping_assign_add(nt->msgcounter, 1); return scnprintf(&nt->extradata_complete[offset], MAX_EXTRADATA_ENTRY_LEN, " msgid=%u\n", nt->msgcounter); } /* * prepare_extradata - append sysdata at extradata_complete in runtime * @nt: target to send message to */ static int prepare_extradata(struct netconsole_target *nt) { int extradata_len; /* userdata was appended when configfs write helper was called * by update_userdata(). */ extradata_len = nt->userdata_length; if (!nt->sysdata_fields) goto out; if (nt->sysdata_fields & SYSDATA_CPU_NR) extradata_len += sysdata_append_cpu_nr(nt, extradata_len); if (nt->sysdata_fields & SYSDATA_TASKNAME) extradata_len += sysdata_append_taskname(nt, extradata_len); if (nt->sysdata_fields & SYSDATA_RELEASE) extradata_len += sysdata_append_release(nt, extradata_len); if (nt->sysdata_fields & SYSDATA_MSGID) extradata_len += sysdata_append_msgid(nt, extradata_len); WARN_ON_ONCE(extradata_len > MAX_EXTRADATA_ENTRY_LEN * MAX_EXTRADATA_ITEMS); out: return extradata_len; } #else /* CONFIG_NETCONSOLE_DYNAMIC not set */ static int prepare_extradata(struct netconsole_target *nt) { return 0; } #endif /* CONFIG_NETCONSOLE_DYNAMIC */ /* Handle network interface device notifications */ static int netconsole_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { unsigned long flags; struct netconsole_target *nt, *tmp; struct net_device *dev = netdev_notifier_info_to_dev(ptr); bool stopped = false; if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER || event == NETDEV_RELEASE || event == NETDEV_JOIN)) goto done; mutex_lock(&target_cleanup_list_lock); spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry_safe(nt, tmp, &target_list, list) { netconsole_target_get(nt); if (nt->np.dev == dev) { switch (event) { case NETDEV_CHANGENAME: strscpy(nt->np.dev_name, dev->name, IFNAMSIZ); break; case NETDEV_RELEASE: case NETDEV_JOIN: case NETDEV_UNREGISTER: nt->enabled = false; list_move(&nt->list, &target_cleanup_list); stopped = true; } } netconsole_target_put(nt); } spin_unlock_irqrestore(&target_list_lock, flags); mutex_unlock(&target_cleanup_list_lock); if (stopped) { const char *msg = "had an event"; switch (event) { case NETDEV_UNREGISTER: msg = "unregistered"; break; case NETDEV_RELEASE: msg = "released slaves"; break; case NETDEV_JOIN: msg = "is joining a master device"; break; } pr_info("network logging stopped on interface %s as it %s\n", dev->name, msg); } /* Process target_cleanup_list entries. By the end, target_cleanup_list * should be empty */ netconsole_process_cleanups_core(); done: return NOTIFY_DONE; } static struct notifier_block netconsole_netdev_notifier = { .notifier_call = netconsole_netdev_event, }; /** * send_udp - Wrapper for netpoll_send_udp that counts errors * @nt: target to send message to * @msg: message to send * @len: length of message * * Calls netpoll_send_udp and classifies the return value. If an error * occurred it increments statistics in nt->stats accordingly. * Only calls netpoll_send_udp if CONFIG_NETCONSOLE_DYNAMIC is disabled. */ static void send_udp(struct netconsole_target *nt, const char *msg, int len) { int result = netpoll_send_udp(&nt->np, msg, len); if (IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) { if (result == NET_XMIT_DROP) { u64_stats_update_begin(&nt->stats.syncp); u64_stats_inc(&nt->stats.xmit_drop_count); u64_stats_update_end(&nt->stats.syncp); } else if (result == -ENOMEM) { u64_stats_update_begin(&nt->stats.syncp); u64_stats_inc(&nt->stats.enomem_count); u64_stats_update_end(&nt->stats.syncp); } } } static void send_msg_no_fragmentation(struct netconsole_target *nt, const char *msg, int msg_len, int release_len) { const char *extradata = NULL; const char *release; #ifdef CONFIG_NETCONSOLE_DYNAMIC extradata = nt->extradata_complete; #endif if (release_len) { release = init_utsname()->release; scnprintf(nt->buf, MAX_PRINT_CHUNK, "%s,%s", release, msg); msg_len += release_len; } else { memcpy(nt->buf, msg, msg_len); } if (extradata) msg_len += scnprintf(&nt->buf[msg_len], MAX_PRINT_CHUNK - msg_len, "%s", extradata); send_udp(nt, nt->buf, msg_len); } static void append_release(char *buf) { const char *release; release = init_utsname()->release; scnprintf(buf, MAX_PRINT_CHUNK, "%s,", release); } static void send_fragmented_body(struct netconsole_target *nt, const char *msgbody, int header_len, int msgbody_len, int extradata_len) { int sent_extradata, preceding_bytes; const char *extradata = NULL; int body_len, offset = 0; #ifdef CONFIG_NETCONSOLE_DYNAMIC extradata = nt->extradata_complete; #endif /* body_len represents the number of bytes that will be sent. This is * bigger than MAX_PRINT_CHUNK, thus, it will be split in multiple * packets */ body_len = msgbody_len + extradata_len; /* In each iteration of the while loop below, we send a packet * containing the header and a portion of the body. The body is * composed of two parts: msgbody and extradata. We keep track of how * many bytes have been sent so far using the offset variable, which * ranges from 0 to the total length of the body. */ while (offset < body_len) { int this_header = header_len; bool msgbody_written = false; int this_offset = 0; int this_chunk = 0; this_header += scnprintf(nt->buf + this_header, MAX_PRINT_CHUNK - this_header, ",ncfrag=%d/%d;", offset, body_len); /* Not all msgbody data has been written yet */ if (offset < msgbody_len) { this_chunk = min(msgbody_len - offset, MAX_PRINT_CHUNK - this_header); if (WARN_ON_ONCE(this_chunk <= 0)) return; memcpy(nt->buf + this_header, msgbody + offset, this_chunk); this_offset += this_chunk; } /* msgbody was finally written, either in the previous * messages and/or in the current buf. Time to write * the extradata. */ msgbody_written |= offset + this_offset >= msgbody_len; /* Msg body is fully written and there is pending extradata to * write, append extradata in this chunk */ if (msgbody_written && offset + this_offset < body_len) { /* Track how much user data was already sent. First * time here, sent_userdata is zero */ sent_extradata = (offset + this_offset) - msgbody_len; /* offset of bytes used in current buf */ preceding_bytes = this_chunk + this_header; if (WARN_ON_ONCE(sent_extradata < 0)) return; this_chunk = min(extradata_len - sent_extradata, MAX_PRINT_CHUNK - preceding_bytes); if (WARN_ON_ONCE(this_chunk < 0)) /* this_chunk could be zero if all the previous * message used all the buffer. This is not a * problem, extradata will be sent in the next * iteration */ return; memcpy(nt->buf + this_header + this_offset, extradata + sent_extradata, this_chunk); this_offset += this_chunk; } send_udp(nt, nt->buf, this_header + this_offset); offset += this_offset; } } static void send_msg_fragmented(struct netconsole_target *nt, const char *msg, int msg_len, int release_len, int extradata_len) { int header_len, msgbody_len; const char *msgbody; /* need to insert extra header fields, detect header and msgbody */ msgbody = memchr(msg, ';', msg_len); if (WARN_ON_ONCE(!msgbody)) return; header_len = msgbody - msg; msgbody_len = msg_len - header_len - 1; msgbody++; /* * Transfer multiple chunks with the following extra header. * "ncfrag=<byte-offset>/<total-bytes>" */ if (release_len) append_release(nt->buf); /* Copy the header into the buffer */ memcpy(nt->buf + release_len, msg, header_len); header_len += release_len; /* for now on, the header will be persisted, and the msgbody * will be replaced */ send_fragmented_body(nt, msgbody, header_len, msgbody_len, extradata_len); } /** * send_ext_msg_udp - send extended log message to target * @nt: target to send message to * @msg: extended log message to send * @msg_len: length of message * * Transfer extended log @msg to @nt. If @msg is longer than * MAX_PRINT_CHUNK, it'll be split and transmitted in multiple chunks with * ncfrag header field added to identify them. */ static void send_ext_msg_udp(struct netconsole_target *nt, const char *msg, int msg_len) { int release_len = 0; int extradata_len; extradata_len = prepare_extradata(nt); if (nt->release) release_len = strlen(init_utsname()->release) + 1; if (msg_len + release_len + extradata_len <= MAX_PRINT_CHUNK) return send_msg_no_fragmentation(nt, msg, msg_len, release_len); return send_msg_fragmented(nt, msg, msg_len, release_len, extradata_len); } static void write_ext_msg(struct console *con, const char *msg, unsigned int len) { struct netconsole_target *nt; unsigned long flags; if ((oops_only && !oops_in_progress) || list_empty(&target_list)) return; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) if (nt->extended && nt->enabled && netif_running(nt->np.dev)) send_ext_msg_udp(nt, msg, len); spin_unlock_irqrestore(&target_list_lock, flags); } static void write_msg(struct console *con, const char *msg, unsigned int len) { int frag, left; unsigned long flags; struct netconsole_target *nt; const char *tmp; if (oops_only && !oops_in_progress) return; /* Avoid taking lock and disabling interrupts unnecessarily */ if (list_empty(&target_list)) return; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) { if (!nt->extended && nt->enabled && netif_running(nt->np.dev)) { /* * We nest this inside the for-each-target loop above * so that we're able to get as much logging out to * at least one target if we die inside here, instead * of unnecessarily keeping all targets in lock-step. */ tmp = msg; for (left = len; left;) { frag = min(left, MAX_PRINT_CHUNK); send_udp(nt, tmp, frag); tmp += frag; left -= frag; } } } spin_unlock_irqrestore(&target_list_lock, flags); } static int netconsole_parser_cmdline(struct netpoll *np, char *opt) { bool ipversion_set = false; char *cur = opt; char *delim; int ipv6; if (*cur != '@') { delim = strchr(cur, '@'); if (!delim) goto parse_failed; *delim = 0; if (kstrtou16(cur, 10, &np->local_port)) goto parse_failed; cur = delim; } cur++; if (*cur != '/') { ipversion_set = true; delim = strchr(cur, '/'); if (!delim) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); if (ipv6 < 0) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim; } cur++; if (*cur != ',') { /* parse out dev_name or dev_mac */ delim = strchr(cur, ','); if (!delim) goto parse_failed; *delim = 0; np->dev_name[0] = '\0'; eth_broadcast_addr(np->dev_mac); if (!strchr(cur, ':')) strscpy(np->dev_name, cur, sizeof(np->dev_name)); else if (!mac_pton(cur, np->dev_mac)) goto parse_failed; cur = delim; } cur++; if (*cur != '@') { /* dst port */ delim = strchr(cur, '@'); if (!delim) goto parse_failed; *delim = 0; if (*cur == ' ' || *cur == '\t') np_info(np, "warning: whitespace is not allowed\n"); if (kstrtou16(cur, 10, &np->remote_port)) goto parse_failed; cur = delim; } cur++; /* dst ip */ delim = strchr(cur, '/'); if (!delim) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); if (ipv6 < 0) goto parse_failed; else if (ipversion_set && np->ipv6 != (bool)ipv6) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim + 1; if (*cur != 0) { /* MAC address */ if (!mac_pton(cur, np->remote_mac)) goto parse_failed; } netconsole_print_banner(np); return 0; parse_failed: np_info(np, "couldn't parse config at '%s'!\n", cur); return -1; } /* Allocate new target (from boot/module param) and setup netpoll for it */ static struct netconsole_target *alloc_param_target(char *target_config, int cmdline_count) { struct netconsole_target *nt; int err; nt = alloc_and_init(); if (!nt) { err = -ENOMEM; goto fail; } if (*target_config == '+') { nt->extended = true; target_config++; } if (*target_config == 'r') { if (!nt->extended) { pr_err("Netconsole configuration error. Release feature requires extended log message"); err = -EINVAL; goto fail; } nt->release = true; target_config++; } /* Parse parameters and setup netpoll */ err = netconsole_parser_cmdline(&nt->np, target_config); if (err) goto fail; err = netpoll_setup(&nt->np); if (err) { pr_err("Not enabling netconsole for %s%d. Netpoll setup failed\n", NETCONSOLE_PARAM_TARGET_PREFIX, cmdline_count); if (!IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) /* only fail if dynamic reconfiguration is set, * otherwise, keep the target in the list, but disabled. */ goto fail; } else { nt->enabled = true; } populate_configfs_item(nt, cmdline_count); return nt; fail: kfree(nt); return ERR_PTR(err); } /* Cleanup netpoll for given target (from boot/module param) and free it */ static void free_param_target(struct netconsole_target *nt) { netpoll_cleanup(&nt->np); kfree(nt); } static struct console netconsole_ext = { .name = "netcon_ext", .flags = CON_ENABLED | CON_EXTENDED, .write = write_ext_msg, }; static struct console netconsole = { .name = "netcon", .flags = CON_ENABLED, .write = write_msg, }; static int __init init_netconsole(void) { int err; struct netconsole_target *nt, *tmp; u32 console_type_needed = 0; unsigned int count = 0; unsigned long flags; char *target_config; char *input = config; if (strnlen(input, MAX_PARAM_LENGTH)) { while ((target_config = strsep(&input, ";"))) { nt = alloc_param_target(target_config, count); if (IS_ERR(nt)) { if (IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) continue; err = PTR_ERR(nt); goto fail; } /* Dump existing printks when we register */ if (nt->extended) { console_type_needed |= CONS_EXTENDED; netconsole_ext.flags |= CON_PRINTBUFFER; } else { console_type_needed |= CONS_BASIC; netconsole.flags |= CON_PRINTBUFFER; } spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); count++; } } err = register_netdevice_notifier(&netconsole_netdev_notifier); if (err) goto fail; err = dynamic_netconsole_init(); if (err) goto undonotifier; if (console_type_needed & CONS_EXTENDED) register_console(&netconsole_ext); if (console_type_needed & CONS_BASIC) register_console(&netconsole); pr_info("network logging started\n"); return err; undonotifier: unregister_netdevice_notifier(&netconsole_netdev_notifier); fail: pr_err("cleaning up\n"); /* * Remove all targets and destroy them (only targets created * from the boot/module option exist here). Skipping the list * lock is safe here, and netpoll_cleanup() will sleep. */ list_for_each_entry_safe(nt, tmp, &target_list, list) { list_del(&nt->list); free_param_target(nt); } return err; } static void __exit cleanup_netconsole(void) { struct netconsole_target *nt, *tmp; if (console_is_registered(&netconsole_ext)) unregister_console(&netconsole_ext); if (console_is_registered(&netconsole)) unregister_console(&netconsole); dynamic_netconsole_exit(); unregister_netdevice_notifier(&netconsole_netdev_notifier); /* * Targets created via configfs pin references on our module * and would first be rmdir(2)'ed from userspace. We reach * here only when they are already destroyed, and only those * created from the boot/module option are left, so remove and * destroy them. Skipping the list lock is safe here, and * netpoll_cleanup() will sleep. */ list_for_each_entry_safe(nt, tmp, &target_list, list) { list_del(&nt->list); free_param_target(nt); } } /* * Use late_initcall to ensure netconsole is * initialized after network device driver if built-in. * * late_initcall() and module_init() are identical if built as module. */ late_initcall(init_netconsole); module_exit(cleanup_netconsole);
2 1 18 56 1 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. */ #ifndef _CRYPTO_INTERNAL_CIPHER_H #define _CRYPTO_INTERNAL_CIPHER_H #include <crypto/algapi.h> struct crypto_cipher { struct crypto_tfm base; }; /** * DOC: Single Block Cipher API * * The single block cipher API is used with the ciphers of type * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto). * * Using the single block cipher API calls, operations with the basic cipher * primitive can be implemented. These cipher primitives exclude any block * chaining operations including IV handling. * * The purpose of this single block cipher API is to support the implementation * of templates or other concepts that only need to perform the cipher operation * on one block at a time. Templates invoke the underlying cipher primitive * block-wise and process either the input or the output data of these cipher * operations. */ static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm) { return (struct crypto_cipher *)tfm; } /** * crypto_alloc_cipher() - allocate single block cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * single block cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a single block cipher. The returned struct * crypto_cipher is the cipher handle that is required for any subsequent API * invocation for that single block cipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_CIPHER; mask |= CRYPTO_ALG_TYPE_MASK; return __crypto_cipher_cast(crypto_alloc_base(alg_name, type, mask)); } static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm) { return &tfm->base; } /** * crypto_free_cipher() - zeroize and free the single block cipher handle * @tfm: cipher handle to be freed */ static inline void crypto_free_cipher(struct crypto_cipher *tfm) { crypto_free_tfm(crypto_cipher_tfm(tfm)); } /** * crypto_has_cipher() - Search for the availability of a single block cipher * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * single block cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Return: true when the single block cipher is known to the kernel crypto API; * false otherwise */ static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_CIPHER; mask |= CRYPTO_ALG_TYPE_MASK; return crypto_has_alg(alg_name, type, mask); } /** * crypto_cipher_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the single block cipher referenced with the cipher handle * tfm is returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm) { return crypto_tfm_alg_blocksize(crypto_cipher_tfm(tfm)); } static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm) { return crypto_tfm_alg_alignmask(crypto_cipher_tfm(tfm)); } static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm) { return crypto_tfm_get_flags(crypto_cipher_tfm(tfm)); } static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_cipher_tfm(tfm), flags); } static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_cipher_tfm(tfm), flags); } /** * crypto_cipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the single block cipher referenced by the * cipher handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_cipher_setkey(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen); /** * crypto_cipher_encrypt_one() - encrypt one block of plaintext * @tfm: cipher handle * @dst: points to the buffer that will be filled with the ciphertext * @src: buffer holding the plaintext to be encrypted * * Invoke the encryption operation of one block. The caller must ensure that * the plaintext and ciphertext buffers are at least one block in size. */ void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src); /** * crypto_cipher_decrypt_one() - decrypt one block of ciphertext * @tfm: cipher handle * @dst: points to the buffer that will be filled with the plaintext * @src: buffer holding the ciphertext to be decrypted * * Invoke the decryption operation of one block. The caller must ensure that * the plaintext and ciphertext buffers are at least one block in size. */ void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src); struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher); struct crypto_cipher_spawn { struct crypto_spawn base; }; static inline int crypto_grab_cipher(struct crypto_cipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_CIPHER; mask |= CRYPTO_ALG_TYPE_MASK; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } static inline void crypto_drop_cipher(struct crypto_cipher_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct crypto_alg *crypto_spawn_cipher_alg( struct crypto_cipher_spawn *spawn) { return spawn->base.alg; } static inline struct crypto_cipher *crypto_spawn_cipher( struct crypto_cipher_spawn *spawn) { u32 type = CRYPTO_ALG_TYPE_CIPHER; u32 mask = CRYPTO_ALG_TYPE_MASK; return __crypto_cipher_cast(crypto_spawn_tfm(&spawn->base, type, mask)); } static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm) { return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher; } #endif
115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 // SPDX-License-Identifier: GPL-2.0 #ifndef _LINUX_MM_SLOT_H #define _LINUX_MM_SLOT_H #include <linux/hashtable.h> #include <linux/slab.h> /* * struct mm_slot - hash lookup from mm to mm_slot * @hash: link to the mm_slots hash list * @mm_node: link into the mm_slots list * @mm: the mm that this information is valid for */ struct mm_slot { struct hlist_node hash; struct list_head mm_node; struct mm_struct *mm; }; #define mm_slot_entry(ptr, type, member) \ container_of(ptr, type, member) static inline void *mm_slot_alloc(struct kmem_cache *cache) { if (!cache) /* initialization failed */ return NULL; return kmem_cache_zalloc(cache, GFP_KERNEL); } static inline void mm_slot_free(struct kmem_cache *cache, void *objp) { kmem_cache_free(cache, objp); } #define mm_slot_lookup(_hashtable, _mm) \ ({ \ struct mm_slot *tmp_slot, *mm_slot = NULL; \ \ hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \ if (_mm == tmp_slot->mm) { \ mm_slot = tmp_slot; \ break; \ } \ \ mm_slot; \ }) #define mm_slot_insert(_hashtable, _mm, _mm_slot) \ ({ \ _mm_slot->mm = _mm; \ hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \ }) #endif /* _LINUX_MM_SLOT_H */
5 5 5 5 5 5 5 5 5 5 5 5 2 5 5 33 5 6 6 4 6 6 6 5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 // SPDX-License-Identifier: GPL-2.0+ /* * COMEDI ISA DMA support functions * Copyright (c) 2014 H Hartley Sweeten <hsweeten@visionengravers.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/delay.h> #include <linux/dma-mapping.h> #include <linux/isa-dma.h> #include <linux/comedi/comedidev.h> #include <linux/comedi/comedi_isadma.h> /** * comedi_isadma_program - program and enable an ISA DMA transfer * @desc: the ISA DMA cookie to program and enable */ void comedi_isadma_program(struct comedi_isadma_desc *desc) { unsigned long flags; flags = claim_dma_lock(); clear_dma_ff(desc->chan); set_dma_mode(desc->chan, desc->mode); set_dma_addr(desc->chan, desc->hw_addr); set_dma_count(desc->chan, desc->size); enable_dma(desc->chan); release_dma_lock(flags); } EXPORT_SYMBOL_GPL(comedi_isadma_program); /** * comedi_isadma_disable - disable the ISA DMA channel * @dma_chan: the DMA channel to disable * * Returns the residue (remaining bytes) left in the DMA transfer. */ unsigned int comedi_isadma_disable(unsigned int dma_chan) { unsigned long flags; unsigned int residue; flags = claim_dma_lock(); disable_dma(dma_chan); residue = get_dma_residue(dma_chan); release_dma_lock(flags); return residue; } EXPORT_SYMBOL_GPL(comedi_isadma_disable); /** * comedi_isadma_disable_on_sample - disable the ISA DMA channel * @dma_chan: the DMA channel to disable * @size: the sample size (in bytes) * * Returns the residue (remaining bytes) left in the DMA transfer. */ unsigned int comedi_isadma_disable_on_sample(unsigned int dma_chan, unsigned int size) { int stalled = 0; unsigned long flags; unsigned int residue; unsigned int new_residue; residue = comedi_isadma_disable(dma_chan); while (residue % size) { /* residue is a partial sample, enable DMA to allow more data */ flags = claim_dma_lock(); enable_dma(dma_chan); release_dma_lock(flags); udelay(2); new_residue = comedi_isadma_disable(dma_chan); /* is DMA stalled? */ if (new_residue == residue) { stalled++; if (stalled > 10) break; } else { residue = new_residue; stalled = 0; } } return residue; } EXPORT_SYMBOL_GPL(comedi_isadma_disable_on_sample); /** * comedi_isadma_poll - poll the current DMA transfer * @dma: the ISA DMA to poll * * Returns the position (in bytes) of the current DMA transfer. */ unsigned int comedi_isadma_poll(struct comedi_isadma *dma) { struct comedi_isadma_desc *desc = &dma->desc[dma->cur_dma]; unsigned long flags; unsigned int result; unsigned int result1; flags = claim_dma_lock(); clear_dma_ff(desc->chan); if (!isa_dma_bridge_buggy) disable_dma(desc->chan); result = get_dma_residue(desc->chan); /* * Read the counter again and choose higher value in order to * avoid reading during counter lower byte roll over if the * isa_dma_bridge_buggy is set. */ result1 = get_dma_residue(desc->chan); if (!isa_dma_bridge_buggy) enable_dma(desc->chan); release_dma_lock(flags); if (result < result1) result = result1; if (result >= desc->size || result == 0) return 0; return desc->size - result; } EXPORT_SYMBOL_GPL(comedi_isadma_poll); /** * comedi_isadma_set_mode - set the ISA DMA transfer direction * @desc: the ISA DMA cookie to set * @dma_dir: the DMA direction */ void comedi_isadma_set_mode(struct comedi_isadma_desc *desc, char dma_dir) { desc->mode = (dma_dir == COMEDI_ISADMA_READ) ? DMA_MODE_READ : DMA_MODE_WRITE; } EXPORT_SYMBOL_GPL(comedi_isadma_set_mode); /** * comedi_isadma_alloc - allocate and initialize the ISA DMA * @dev: comedi_device struct * @n_desc: the number of cookies to allocate * @dma_chan1: DMA channel for the first cookie * @dma_chan2: DMA channel for the second cookie * @maxsize: the size of the buffer to allocate for each cookie * @dma_dir: the DMA direction * * Returns the allocated and initialized ISA DMA or NULL if anything fails. */ struct comedi_isadma *comedi_isadma_alloc(struct comedi_device *dev, int n_desc, unsigned int dma_chan1, unsigned int dma_chan2, unsigned int maxsize, char dma_dir) { struct comedi_isadma *dma = NULL; struct comedi_isadma_desc *desc; unsigned int dma_chans[2]; int i; if (n_desc < 1 || n_desc > 2) goto no_dma; dma = kzalloc(sizeof(*dma), GFP_KERNEL); if (!dma) goto no_dma; desc = kcalloc(n_desc, sizeof(*desc), GFP_KERNEL); if (!desc) goto no_dma; dma->desc = desc; dma->n_desc = n_desc; if (dev->hw_dev) { dma->dev = dev->hw_dev; } else { /* Fall back to using the "class" device. */ if (!dev->class_dev) goto no_dma; /* Need 24-bit mask for ISA DMA. */ if (dma_coerce_mask_and_coherent(dev->class_dev, DMA_BIT_MASK(24))) { goto no_dma; } dma->dev = dev->class_dev; } dma_chans[0] = dma_chan1; if (dma_chan2 == 0 || dma_chan2 == dma_chan1) dma_chans[1] = dma_chan1; else dma_chans[1] = dma_chan2; if (request_dma(dma_chans[0], dev->board_name)) goto no_dma; dma->chan = dma_chans[0]; if (dma_chans[1] != dma_chans[0]) { if (request_dma(dma_chans[1], dev->board_name)) goto no_dma; } dma->chan2 = dma_chans[1]; for (i = 0; i < n_desc; i++) { desc = &dma->desc[i]; desc->chan = dma_chans[i]; desc->maxsize = maxsize; desc->virt_addr = dma_alloc_coherent(dma->dev, desc->maxsize, &desc->hw_addr, GFP_KERNEL); if (!desc->virt_addr) goto no_dma; comedi_isadma_set_mode(desc, dma_dir); } return dma; no_dma: comedi_isadma_free(dma); return NULL; } EXPORT_SYMBOL_GPL(comedi_isadma_alloc); /** * comedi_isadma_free - free the ISA DMA * @dma: the ISA DMA to free */ void comedi_isadma_free(struct comedi_isadma *dma) { struct comedi_isadma_desc *desc; int i; if (!dma) return; if (dma->desc) { for (i = 0; i < dma->n_desc; i++) { desc = &dma->desc[i]; if (desc->virt_addr) dma_free_coherent(dma->dev, desc->maxsize, desc->virt_addr, desc->hw_addr); } kfree(dma->desc); } if (dma->chan2 && dma->chan2 != dma->chan) free_dma(dma->chan2); if (dma->chan) free_dma(dma->chan); kfree(dma); } EXPORT_SYMBOL_GPL(comedi_isadma_free); static int __init comedi_isadma_init(void) { return 0; } module_init(comedi_isadma_init); static void __exit comedi_isadma_exit(void) { } module_exit(comedi_isadma_exit); MODULE_AUTHOR("H Hartley Sweeten <hsweeten@visionengravers.com>"); MODULE_DESCRIPTION("Comedi ISA DMA support"); MODULE_LICENSE("GPL");
1267 1269 1267 139 146 145 145 145 145 145 145 145 145 145 149 168 36 36 35 150 36 31 150 30 30 1 30 145 145 145 145 145 145 145 145 26 26 25 25 25 25 25 25 14 25 7 25 25 26 25 145 145 145 145 145 145 145 145 145 12 173 174 122 68 174 173 160 159 160 22 22 22 8 8 8 143 142 139 139 139 139 139 139 139 136 139 138 2 139 15 139 171 150 150 46 150 46 45 1 46 46 16 46 169 15 4 15 15 143 143 1 143 143 143 143 143 143 2 142 143 142 11 11 11 11 11 3 11 11 11 142 138 133 131 134 134 134 23 114 143 143 143 143 6 142 23 6 6 6 6 2 1 2 4 143 143 143 143 143 136 136 1 136 136 178 7 171 2 170 34 150 150 144 145 144 144 143 137 137 19 143 139 164 171 97 97 73 112 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/rseq.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> /* For vma exec functions. */ #include "../mm/internal.h" static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { /* If it's an anonymous inode make sure that we catch any shenanigans. */ VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) && !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)); return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * ahead of time. */ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) return NULL; /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); #ifndef CONFIG_MMU bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); #else err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); if (err) goto err; #endif return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static inline int bprm_set_stack_limit(struct linux_binprm *bprm, unsigned long limit) { #ifdef CONFIG_MMU /* Avoid a pathological bprm->p. */ if (bprm->p < limit) return -E2BIG; bprm->argmin = bprm->p - limit; #endif return 0; } static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) { #ifdef CONFIG_MMU return bprm->p < bprm->argmin; #else return false; #endif } /* * Calculate bprm->argmin from: * - _STK_LIM * - ARG_MAX * - bprm->rlim_stack.rlim_cur * - bprm->argc * - bprm->envc * - bprm->p */ static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* Reject totally pathological counts. */ if (bprm->argc < 0 || bprm->envc < 0) return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) return -E2BIG; if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; return bprm_set_stack_limit(bprm, limit); } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) goto out; while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { int ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; vm_flags_t vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. */ ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ /* * On success, caller must call do_close_execat() on the returned * struct file to close it. */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { int err; struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file; if (path_noexec(&file->f_path)) return ERR_PTR(-EACCES); /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode))) return ERR_PTR(-EACCES); err = exe_file_deny_write_access(file); if (err) return ERR_PTR(err); return no_free_ptr(file); } /** * open_exec - Open a path name for execution * * @name: path name to open with the intent of executing it. * * Returns ERR_PTR on failure or allocated struct file on success. * * As this is a wrapper for the internal do_open_execat(), callers * must call exe_file_allow_write_access() before fput() on release. Also see * do_close_execat(). */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm, tsk); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } /* * This is unlocked -- the string will always be NUL-terminated, but * may show overlapping contents if racing concurrent reads. */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { size_t len = min(strlen(buf), sizeof(tsk->comm) - 1); trace_task_rename(tsk, buf); memcpy(tsk->comm, buf, len); memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * This tracepoint marks the point before flushing the old exec where * the current task is still unchanged, but errors are fatal (point of * no return). The later "sched_process_exec" tracepoint is called after * the current task has successfully switched to the new exec. */ trace_sched_prepare_exec(current, bprm); /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; /* see the comment in check_unsafe_exec() */ current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); /* * If the original filename was empty, alloc_bprm() made up a path * that will probably not be useful to admins running ps or similar. * Let's fix it up to be something reasonable. */ if (bprm->comm_from_dentry) { /* * Hold RCU lock to keep the name from being freed behind our back. * Use acquire semantics to make sure the terminating NUL from * __d_alloc() is seen. * * Note, we're deliberately sloppy here. We don't need to care about * detecting a concurrent rename and just want a terminated name. */ rcu_read_lock(); __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), true); rcu_read_unlock(); } else { __set_task_comm(me, kbasename(bprm->filename), true); } /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { if (!file) return; exe_file_allow_write_access(file); fput(file); } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { /* in case exec fails before de_thread() succeeds */ current->fs->in_exec = 0; mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM; file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) return ERR_CAST(file); bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { do_close_execat(file); return ERR_PTR(-ENOMEM); } bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); bprm->comm_from_dentry = 1; } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); } if (!bprm->fdpath) goto out_free; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; /* * At this point, security_file_open() has already been called (with * __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will * stop just after the security_bprm_creds_for_exec() call in * bprm_execve(). Indeed, the kernel should not try to parse the * content of the file with exec_binprm() nor change the calling * thread, which means that the following security functions will not * be called: * - security_bprm_check() * - security_bprm_creds_from_file() * - security_bprm_committing_creds() * - security_bprm_committed_creds() */ bprm->is_check = !!(flags & AT_EXECVE_CHECK); retval = bprm_mm_init(bprm); if (!retval) return bprm; out_free: free_bprm(bprm); return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... * * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) * from another sub-thread until de_thread() succeeds, this * state is protected by cred_guard_mutex we hold. */ n_fs = 1; read_seqlock_excl(&p->fs->seq); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; read_sequnlock_excl(&p->fs->seq); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; int err; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); /* Did the exec bit vanish out from under us? Give up. */ if (err) return; /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; return 0; } EXPORT_SYMBOL(remove_arg_zero); /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); return -ENOEXEC; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; exe_file_allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } static int bprm_execve(struct linux_binprm *bprm) { int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); sched_exec(); /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval || bprm->is_check) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); rseq_execve(current); /* execve succeeded */ current->in_execve = 0; user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); rseq_set_notify_resume(current); current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); } retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; __mm_flags_set_mask_dumpable(mm, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error && !write) validate_coredump_safety(); return error; } static const struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_EXEC_KUNIT_TEST #include "tests/exec_kunit.c" #endif
19 19 19 19 19 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC Tx data buffering. * * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include "ar-internal.h" static atomic_t rxrpc_txbuf_debug_ids; atomic_t rxrpc_nr_txbuf; /* * Allocate and partially initialise a data transmission buffer. */ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp) { struct rxrpc_txbuf *txb; size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); void *buf; txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; /* We put a jumbo header in the buffer, but not a full wire header to * avoid delayed-corruption problems with zerocopy. */ doff = round_up(jsize, data_align); total = doff + data_size; data_align = umax(data_align, L1_CACHE_BYTES); mutex_lock(&call->conn->tx_data_alloc_lock); buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, data_align); mutex_unlock(&call->conn->tx_data_alloc_lock); if (!buf) { kfree(txb); return NULL; } refcount_set(&txb->ref, 1); txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); txb->alloc_size = data_size; txb->space = data_size; txb->offset = 0; txb->flags = call->conn->out_clientflag; txb->seq = call->send_top + 1; txb->data = buf + doff; trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, rxrpc_txbuf_alloc_data); atomic_inc(&rxrpc_nr_txbuf); return txb; } void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r = refcount_read(&txb->ref); trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r, what); } static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); if (txb->data) page_frag_free(txb->data); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { unsigned int debug_id, call_debug_id; rxrpc_seq_t seq; bool dead; int r; if (txb) { debug_id = txb->debug_id; call_debug_id = txb->call_debug_id; seq = txb->seq; dead = __refcount_dec_and_test(&txb->ref, &r); trace_rxrpc_txbuf(debug_id, call_debug_id, seq, r - 1, what); if (dead) rxrpc_free_txbuf(txb); } }
474 474 471 474 474 277 10 277 8 275 1 276 1 276 277 276 261 277 100 94 94 99 120 109 120 15 15 15 15 3 3 3 3 457 487 487 32 30 29 461 462 487 8 8 8 8 8 8 8 8 8 8 488 17 472 488 308 306 303 192 357 277 107 1 107 46 46 46 46 95 107 89 107 92 7 4 92 39 22 22 20 22 69 3 69 3 69 85 15 2 7 7 6 5 15 15 497 496 497 33 498 33 120 120 120 2 1 2 1 1 2 488 107 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * This module provides the abstraction for an SCTP transport representing * a remote transport address. For local transport addresses, we just use * union sctp_addr. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/random.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* 1st Level Abstractions. */ /* Initialize a new transport from provided memory. */ static struct sctp_transport *sctp_transport_init(struct net *net, struct sctp_transport *peer, const union sctp_addr *addr, gfp_t gfp) { /* Copy in the address. */ peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); memcpy(&peer->ipaddr, addr, peer->af_specific->sockaddr_len); memset(&peer->saddr, 0, sizeof(union sctp_addr)); peer->sack_generation = 0; /* From 6.3.1 RTO Calculation: * * C1) Until an RTT measurement has been made for a packet sent to the * given destination transport address, set RTO to the protocol * parameter 'RTO.Initial'. */ peer->rto = msecs_to_jiffies(net->sctp.rto_initial); peer->last_time_heard = 0; peer->last_time_ecne_reduced = jiffies; peer->param_flags = SPP_HB_DISABLE | SPP_PMTUD_ENABLE | SPP_SACKDELAY_ENABLE; /* Initialize the default path max_retrans. */ peer->pathmaxrxt = net->sctp.max_retrans_path; peer->pf_retrans = net->sctp.pf_retrans; INIT_LIST_HEAD(&peer->transmitted); INIT_LIST_HEAD(&peer->send_ready); INIT_LIST_HEAD(&peer->transports); timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0); timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0); timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0); timer_setup(&peer->probe_timer, sctp_generate_probe_event, 0); timer_setup(&peer->proto_unreach_timer, sctp_generate_proto_unreach_event, 0); /* Initialize the 64-bit random nonce sent with heartbeat. */ get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); refcount_set(&peer->refcnt, 1); return peer; } /* Allocate and initialize a new transport. */ struct sctp_transport *sctp_transport_new(struct net *net, const union sctp_addr *addr, gfp_t gfp) { struct sctp_transport *transport; transport = kzalloc(sizeof(*transport), gfp); if (!transport) goto fail; if (!sctp_transport_init(net, transport, addr, gfp)) goto fail_init; SCTP_DBG_OBJCNT_INC(transport); return transport; fail_init: kfree(transport); fail: return NULL; } /* This transport is no longer needed. Free up if possible, or * delay until it last reference count. */ void sctp_transport_free(struct sctp_transport *transport) { transport->dead = 1; /* Try to delete the heartbeat timer. */ if (timer_delete(&transport->hb_timer)) sctp_transport_put(transport); /* Delete the T3_rtx timer if it's active. * There is no point in not doing this now and letting * structure hang around in memory since we know * the transport is going away. */ if (timer_delete(&transport->T3_rtx_timer)) sctp_transport_put(transport); if (timer_delete(&transport->reconf_timer)) sctp_transport_put(transport); if (timer_delete(&transport->probe_timer)) sctp_transport_put(transport); /* Delete the ICMP proto unreachable timer if it's active. */ if (timer_delete(&transport->proto_unreach_timer)) sctp_transport_put(transport); sctp_transport_put(transport); } static void sctp_transport_destroy_rcu(struct rcu_head *head) { struct sctp_transport *transport; transport = container_of(head, struct sctp_transport, rcu); dst_release(transport->dst); kfree(transport); SCTP_DBG_OBJCNT_DEC(transport); } /* Destroy the transport data structure. * Assumes there are no more users of this structure. */ static void sctp_transport_destroy(struct sctp_transport *transport) { if (unlikely(refcount_read(&transport->refcnt))) { WARN(1, "Attempt to destroy undead transport %p!\n", transport); return; } sctp_packet_free(&transport->packet); if (transport->asoc) sctp_association_put(transport->asoc); call_rcu(&transport->rcu, sctp_transport_destroy_rcu); } /* Start T3_rtx timer if it is not already running and update the heartbeat * timer. This routine is called every time a DATA chunk is sent. */ void sctp_transport_reset_t3_rtx(struct sctp_transport *transport) { /* RFC 2960 6.3.2 Retransmission Timer Rules * * R1) Every time a DATA chunk is sent to any address(including a * retransmission), if the T3-rtx timer of that address is not running * start it running so that it will expire after the RTO of that * address. */ if (!timer_pending(&transport->T3_rtx_timer)) if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } void sctp_transport_reset_hb_timer(struct sctp_transport *transport) { unsigned long expires; /* When a data chunk is sent, reset the heartbeat interval. */ expires = jiffies + sctp_transport_timeout(transport); if (!mod_timer(&transport->hb_timer, expires + get_random_u32_below(transport->rto))) sctp_transport_hold(transport); } void sctp_transport_reset_reconf_timer(struct sctp_transport *transport) { if (!timer_pending(&transport->reconf_timer)) if (!mod_timer(&transport->reconf_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } void sctp_transport_reset_probe_timer(struct sctp_transport *transport) { if (!mod_timer(&transport->probe_timer, jiffies + transport->probe_interval)) sctp_transport_hold(transport); } void sctp_transport_reset_raise_timer(struct sctp_transport *transport) { if (!mod_timer(&transport->probe_timer, jiffies + transport->probe_interval * 30)) sctp_transport_hold(transport); } /* This transport has been assigned to an association. * Initialize fields from the association or from the sock itself. * Register the reference count in the association. */ void sctp_transport_set_owner(struct sctp_transport *transport, struct sctp_association *asoc) { transport->asoc = asoc; sctp_association_hold(asoc); } /* Initialize the pmtu of a transport. */ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ if (!transport->dst || READ_ONCE(transport->dst->obsolete)) { sctp_transport_dst_release(transport); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); } if (transport->param_flags & SPP_PMTUD_DISABLE) { struct sctp_association *asoc = transport->asoc; if (!transport->pathmtu && asoc && asoc->pathmtu) transport->pathmtu = asoc->pathmtu; if (transport->pathmtu) return; } if (transport->dst) transport->pathmtu = sctp_dst_mtu(transport->dst); else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; sctp_transport_pl_update(transport); } void sctp_transport_pl_send(struct sctp_transport *t) { if (t->pl.probe_count < SCTP_MAX_PROBES) goto out; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } } else if (t->pl.state == SCTP_PL_SEARCH) { if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ t->pl.state = SCTP_PL_BASE; /* Search -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } else { /* Normal probe failure. */ t->pl.probe_high = t->pl.probe_size; t->pl.probe_size = t->pl.pmtu; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ t->pl.state = SCTP_PL_BASE; /* Search Complete -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } } out: pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); t->pl.probe_count++; } bool sctp_transport_pl_recv(struct sctp_transport *t) { pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); t->pl.pmtu = t->pl.probe_size; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { t->pl.state = SCTP_PL_SEARCH; /* Base -> Search */ t->pl.probe_size += SCTP_PL_BIG_STEP; } else if (t->pl.state == SCTP_PL_ERROR) { t->pl.state = SCTP_PL_SEARCH; /* Error -> Search */ t->pl.pmtu = t->pl.probe_size; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); t->pl.probe_size += SCTP_PL_BIG_STEP; } else if (t->pl.state == SCTP_PL_SEARCH) { if (!t->pl.probe_high) { if (t->pl.probe_size < SCTP_MAX_PLPMTU) { t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, SCTP_MAX_PLPMTU); return false; } t->pl.probe_high = SCTP_MAX_PLPMTU; } t->pl.probe_size += SCTP_PL_MIN_STEP; if (t->pl.probe_size >= t->pl.probe_high) { t->pl.probe_high = 0; t->pl.state = SCTP_PL_COMPLETE; /* Search -> Search Complete */ t->pl.probe_size = t->pl.pmtu; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); sctp_transport_reset_raise_timer(t); } } else if (t->pl.state == SCTP_PL_COMPLETE) { /* Raise probe_size again after 30 * interval in Search Complete */ t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_MIN_STEP, SCTP_MAX_PLPMTU); } return t->pl.state == SCTP_PL_COMPLETE; } static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) { pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, ptb: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, pmtu); if (pmtu < SCTP_MIN_PLPMTU || pmtu >= t->pl.probe_size) return false; if (t->pl.state == SCTP_PL_BASE) { if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) { t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); return true; } } else if (t->pl.state == SCTP_PL_SEARCH) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { t->pl.state = SCTP_PL_BASE; /* Search -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_count = 0; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); return true; } else if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) { t->pl.probe_size = pmtu; t->pl.probe_count = 0; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { t->pl.state = SCTP_PL_BASE; /* Complete -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_count = 0; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_transport_reset_probe_timer(t); return true; } } return false; } bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct sock *sk = t->asoc->base.sk; struct dst_entry *dst; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n", __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); /* Use default minimum segment instead */ pmtu = SCTP_DEFAULT_MINSEGMENT; } pmtu = SCTP_TRUNC4(pmtu); if (sctp_transport_pl_enabled(t)) return sctp_transport_pl_toobig(t, pmtu - sctp_transport_pl_hlen(t)); dst = sctp_transport_dst_check(t); if (dst) { struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); union sctp_addr addr; pf->af->from_sk(&addr, sk); pf->to_sk_daddr(&t->ipaddr, sk); dst->ops->update_pmtu(dst, sk, NULL, pmtu, true); pf->to_sk_daddr(&addr, sk); dst = sctp_transport_dst_check(t); } if (!dst) { t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); dst = t->dst; } if (dst) { /* Re-fetch, as under layers may have a higher minimum size */ pmtu = sctp_dst_mtu(dst); change = t->pathmtu != pmtu; } t->pathmtu = pmtu; return change; } /* Caches the dst entry and source address for a transport's destination * address. */ void sctp_transport_route(struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sock *opt) { struct sctp_association *asoc = transport->asoc; struct sctp_af *af = transport->af_specific; sctp_transport_dst_release(transport); af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt)); if (saddr) memcpy(&transport->saddr, saddr, sizeof(union sctp_addr)); else af->get_saddr(opt, transport, &transport->fl); sctp_transport_pmtu(transport, sctp_opt2sk(opt)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). */ if (transport->dst && asoc && (!asoc->peer.primary_path || transport == asoc->peer.active_path)) opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk); } /* Hold a reference to a transport. */ int sctp_transport_hold(struct sctp_transport *transport) { return refcount_inc_not_zero(&transport->refcnt); } /* Release a reference to a transport and clean up * if there are no more references. */ void sctp_transport_put(struct sctp_transport *transport) { if (refcount_dec_and_test(&transport->refcnt)) sctp_transport_destroy(transport); } /* Update transport's RTO based on the newly calculated RTT. */ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) { if (unlikely(!tp->rto_pending)) /* We should not be doing any RTO updates unless rto_pending is set. */ pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp); if (tp->rttvar || tp->srtt) { struct net *net = tp->asoc->base.net; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' */ /* Note: The above algorithm has been rewritten to * express rto_beta and rto_alpha as inverse powers * of two. * For example, assuming the default value of RTO.Alpha of * 1/8, rto_alpha would be expressed as 3. */ tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta) + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha) + (rtt >> net->sctp.rto_alpha); } else { /* 6.3.1 C2) When the first RTT measurement R is made, set * SRTT <- R, RTTVAR <- R/2. */ tp->srtt = rtt; tp->rttvar = rtt >> 1; } /* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY. */ if (tp->rttvar == 0) tp->rttvar = SCTP_CLOCK_GRANULARITY; /* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */ tp->rto = tp->srtt + (tp->rttvar << 2); /* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min * seconds then it is rounded up to RTO.Min seconds. */ if (tp->rto < tp->asoc->rto_min) tp->rto = tp->asoc->rto_min; /* 6.3.1 C7) A maximum value may be placed on RTO provided it is * at least RTO.max seconds. */ if (tp->rto > tp->asoc->rto_max) tp->rto = tp->asoc->rto_max; sctp_max_rto(tp->asoc, tp); tp->rtt = rtt; /* Reset rto_pending so that a new RTT measurement is started when a * new data chunk is sent. */ tp->rto_pending = 0; pr_debug("%s: transport:%p, rtt:%d, srtt:%d rttvar:%d, rto:%ld\n", __func__, tp, rtt, tp->srtt, tp->rttvar, tp->rto); } /* This routine updates the transport's cwnd and partial_bytes_acked * parameters based on the bytes acked in the received SACK. */ void sctp_transport_raise_cwnd(struct sctp_transport *transport, __u32 sack_ctsn, __u32 bytes_acked) { struct sctp_association *asoc = transport->asoc; __u32 cwnd, ssthresh, flight_size, pba, pmtu; cwnd = transport->cwnd; flight_size = transport->flight_size; /* See if we need to exit Fast Recovery first */ if (asoc->fast_recovery && TSN_lte(asoc->fast_recovery_exit, sack_ctsn)) asoc->fast_recovery = 0; ssthresh = transport->ssthresh; pba = transport->partial_bytes_acked; pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { /* RFC 4960 7.2.1 * o When cwnd is less than or equal to ssthresh, an SCTP * endpoint MUST use the slow-start algorithm to increase * cwnd only if the current congestion window is being fully * utilized, an incoming SACK advances the Cumulative TSN * Ack Point, and the data sender is not in Fast Recovery. * Only when these three conditions are met can the cwnd be * increased; otherwise, the cwnd MUST not be increased. * If these conditions are met, then cwnd MUST be increased * by, at most, the lesser of 1) the total size of the * previously outstanding DATA chunk(s) acknowledged, and * 2) the destination's path MTU. This upper bound protects * against the ACK-Splitting attack outlined in [SAVAGE99]. */ if (asoc->fast_recovery) return; /* The appropriate cwnd increase algorithm is performed * if, and only if the congestion window is being fully * utilized. Note that RFC4960 Errata 3.22 removed the * other condition on ctsn moving. */ if (flight_size < cwnd) return; if (bytes_acked > pmtu) cwnd += pmtu; else cwnd += bytes_acked; pr_debug("%s: slow start: transport:%p, bytes_acked:%d, " "cwnd:%d, ssthresh:%d, flight_size:%d, pba:%d\n", __func__, transport, bytes_acked, cwnd, ssthresh, flight_size, pba); } else { /* RFC 2960 7.2.2 Whenever cwnd is greater than ssthresh, * upon each SACK arrival, increase partial_bytes_acked * by the total number of bytes of all new chunks * acknowledged in that SACK including chunks * acknowledged by the new Cumulative TSN Ack and by Gap * Ack Blocks. (updated by RFC4960 Errata 3.22) * * When partial_bytes_acked is greater than cwnd and * before the arrival of the SACK the sender had less * bytes of data outstanding than cwnd (i.e., before * arrival of the SACK, flightsize was less than cwnd), * reset partial_bytes_acked to cwnd. (RFC 4960 Errata * 3.26) * * When partial_bytes_acked is equal to or greater than * cwnd and before the arrival of the SACK the sender * had cwnd or more bytes of data outstanding (i.e., * before arrival of the SACK, flightsize was greater * than or equal to cwnd), partial_bytes_acked is reset * to (partial_bytes_acked - cwnd). Next, cwnd is * increased by MTU. (RFC 4960 Errata 3.12) */ pba += bytes_acked; if (pba > cwnd && flight_size < cwnd) pba = cwnd; if (pba >= cwnd && flight_size >= cwnd) { pba = pba - cwnd; cwnd += pmtu; } pr_debug("%s: congestion avoidance: transport:%p, " "bytes_acked:%d, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, bytes_acked, cwnd, ssthresh, flight_size, pba); } transport->cwnd = cwnd; transport->partial_bytes_acked = pba; } /* This routine is used to lower the transport's cwnd when congestion is * detected. */ void sctp_transport_lower_cwnd(struct sctp_transport *transport, enum sctp_lower_cwnd reason) { struct sctp_association *asoc = transport->asoc; switch (reason) { case SCTP_LOWER_CWND_T3_RTX: /* RFC 2960 Section 7.2.3, sctpimpguide * When the T3-rtx timer expires on an address, SCTP should * perform slow start by: * ssthresh = max(cwnd/2, 4*MTU) * cwnd = 1*MTU * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = asoc->pathmtu; /* T3-rtx also clears fast recovery */ asoc->fast_recovery = 0; break; case SCTP_LOWER_CWND_FAST_RTX: /* RFC 2960 7.2.4 Adjust the ssthresh and cwnd of the * destination address(es) to which the missing DATA chunks * were last sent, according to the formula described in * Section 7.2.3. * * RFC 2960 7.2.3, sctpimpguide Upon detection of packet * losses from SACK (see Section 7.2.4), An endpoint * should do the following: * ssthresh = max(cwnd/2, 4*MTU) * cwnd = ssthresh * partial_bytes_acked = 0 */ if (asoc->fast_recovery) return; /* Mark Fast recovery */ asoc->fast_recovery = 1; asoc->fast_recovery_exit = asoc->next_tsn - 1; transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; break; case SCTP_LOWER_CWND_ECNE: /* RFC 2481 Section 6.1.2. * If the sender receives an ECN-Echo ACK packet * then the sender knows that congestion was encountered in the * network on the path from the sender to the receiver. The * indication of congestion should be treated just as a * congestion loss in non-ECN Capable TCP. That is, the TCP * source halves the congestion window "cwnd" and reduces the * slow start threshold "ssthresh". * A critical condition is that TCP does not react to * congestion indications more than once every window of * data (or more loosely more than once every round-trip time). */ if (time_after(jiffies, transport->last_time_ecne_reduced + transport->rtt)) { transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } break; case SCTP_LOWER_CWND_INACTIVE: /* RFC 2960 Section 7.2.1, sctpimpguide * When the endpoint does not transmit data on a given * transport address, the cwnd of the transport address * should be adjusted to max(cwnd/2, 4*MTU) per RTO. * NOTE: Although the draft recommends that this check needs * to be done every RTO interval, we do it every hearbeat * interval. */ transport->cwnd = max(transport->cwnd/2, 4*asoc->pathmtu); /* RFC 4960 Errata 3.27.2: also adjust sshthresh */ transport->ssthresh = transport->cwnd; break; } transport->partial_bytes_acked = 0; pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d\n", __func__, transport, reason, transport->cwnd, transport->ssthresh); } /* Apply Max.Burst limit to the congestion window: * sctpimpguide-05 2.14.2 * D) When the time comes for the sender to * transmit new DATA chunks, the protocol parameter Max.Burst MUST * first be applied to limit how many new DATA chunks may be sent. * The limit is applied by adjusting cwnd as follows: * if ((flightsize+ Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ void sctp_transport_burst_limited(struct sctp_transport *t) { struct sctp_association *asoc = t->asoc; u32 old_cwnd = t->cwnd; u32 max_burst_bytes; if (t->burst_limited || asoc->max_burst == 0) return; max_burst_bytes = t->flight_size + (asoc->max_burst * asoc->pathmtu); if (max_burst_bytes < old_cwnd) { t->cwnd = max_burst_bytes; t->burst_limited = old_cwnd; } } /* Restore the old cwnd congestion window, after the burst had it's * desired effect. */ void sctp_transport_burst_reset(struct sctp_transport *t) { if (t->burst_limited) { t->cwnd = t->burst_limited; t->burst_limited = 0; } } /* What is the next timeout value for this transport? */ unsigned long sctp_transport_timeout(struct sctp_transport *trans) { /* RTO + timer slack +/- 50% of RTO */ unsigned long timeout = trans->rto >> 1; if (trans->state != SCTP_UNCONFIRMED && trans->state != SCTP_PF) timeout += trans->hbinterval; return max_t(unsigned long, timeout, HZ / 5); } /* Reset transport variables to their initial values */ void sctp_transport_reset(struct sctp_transport *t) { struct sctp_association *asoc = t->asoc; /* RFC 2960 (bis), Section 5.2.4 * All the congestion control parameters (e.g., cwnd, ssthresh) * related to this peer MUST be reset to their initial values * (see Section 6.2.1) */ t->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); t->burst_limited = 0; t->ssthresh = asoc->peer.i.a_rwnd; t->rto = asoc->rto_initial; sctp_max_rto(asoc, t); t->rtt = 0; t->srtt = 0; t->rttvar = 0; /* Reset these additional variables so that we have a clean slate. */ t->partial_bytes_acked = 0; t->flight_size = 0; t->error_count = 0; t->rto_pending = 0; t->hb_sent = 0; /* Initialize the state information for SFR-CACC */ t->cacc.changeover_active = 0; t->cacc.cycling_changeover = 0; t->cacc.next_tsn_at_change = 0; t->cacc.cacc_saw_newack = 0; } /* Schedule retransmission on the given transport */ void sctp_transport_immediate_rtx(struct sctp_transport *t) { /* Stop pending T3_rtx_timer */ if (timer_delete(&t->T3_rtx_timer)) sctp_transport_put(t); sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX); if (!timer_pending(&t->T3_rtx_timer)) { if (!mod_timer(&t->T3_rtx_timer, jiffies + t->rto)) sctp_transport_hold(t); } } /* Drop dst */ void sctp_transport_dst_release(struct sctp_transport *t) { dst_release(t->dst); t->dst = NULL; t->dst_pending_confirm = 0; } /* Schedule neighbour confirm */ void sctp_transport_dst_confirm(struct sctp_transport *t) { t->dst_pending_confirm = 1; }
10 10 2 2 2 1 2 2 1 1 1 1 1 2 69 68 69 19 19 8 69 10 10 72 71 71 3 3 2 2 72 72 72 60 70 72 72 72 73 24 72 70 72 23 10 1 10 10 10 7 10 1 10 71 72 64 64 64 93 94 94 94 94 94 94 29 29 29 29 1 29 29 29 42 42 42 25 25 25 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Memory Manager * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> * 2000 by Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/mm.h> #include <sound/core.h> #include <sound/seq_kernel.h> #include "seq_memory.h" #include "seq_queue.h" #include "seq_info.h" #include "seq_lock.h" static inline int snd_seq_pool_available(struct snd_seq_pool *pool) { return pool->total_elements - atomic_read(&pool->counter); } static inline int snd_seq_output_ok(struct snd_seq_pool *pool) { return snd_seq_pool_available(pool) >= pool->room; } /* * Variable length event: * The event like sysex uses variable length type. * The external data may be stored in three different formats. * 1) kernel space * This is the normal case. * ext.data.len = length * ext.data.ptr = buffer pointer * 2) user space * When an event is generated via read(), the external data is * kept in user space until expanded. * ext.data.len = length | SNDRV_SEQ_EXT_USRPTR * ext.data.ptr = userspace pointer * 3) chained cells * When the variable length event is enqueued (in prioq or fifo), * the external data is decomposed to several cells. * ext.data.len = length | SNDRV_SEQ_EXT_CHAINED * ext.data.ptr = the additiona cell head * -> cell.next -> cell.next -> .. */ /* * exported: * call dump function to expand external data. */ static int get_var_len(const struct snd_seq_event *event) { if ((event->flags & SNDRV_SEQ_EVENT_LENGTH_MASK) != SNDRV_SEQ_EVENT_LENGTH_VARIABLE) return -EINVAL; return event->data.ext.len & ~SNDRV_SEQ_EXT_MASK; } static int dump_var_event(const struct snd_seq_event *event, snd_seq_dump_func_t func, void *private_data, int offset, int maxlen) { int len, err; struct snd_seq_event_cell *cell; len = get_var_len(event); if (len <= 0) return len; if (len <= offset) return 0; if (maxlen && len > offset + maxlen) len = offset + maxlen; if (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR) { char buf[32]; char __user *curptr = (char __force __user *)event->data.ext.ptr; curptr += offset; len -= offset; while (len > 0) { int size = sizeof(buf); if (len < size) size = len; if (copy_from_user(buf, curptr, size)) return -EFAULT; err = func(private_data, buf, size); if (err < 0) return err; curptr += size; len -= size; } return 0; } if (!(event->data.ext.len & SNDRV_SEQ_EXT_CHAINED)) return func(private_data, event->data.ext.ptr + offset, len - offset); cell = (struct snd_seq_event_cell *)event->data.ext.ptr; for (; len > 0 && cell; cell = cell->next) { int size = sizeof(struct snd_seq_event); char *curptr = (char *)&cell->event; if (offset >= size) { offset -= size; len -= size; continue; } if (len < size) size = len; err = func(private_data, curptr + offset, size - offset); if (err < 0) return err; offset = 0; len -= size; } return 0; } int snd_seq_dump_var_event(const struct snd_seq_event *event, snd_seq_dump_func_t func, void *private_data) { return dump_var_event(event, func, private_data, 0, 0); } EXPORT_SYMBOL(snd_seq_dump_var_event); /* * exported: * expand the variable length event to linear buffer space. */ static int seq_copy_in_kernel(void *ptr, void *src, int size) { char **bufptr = ptr; memcpy(*bufptr, src, size); *bufptr += size; return 0; } static int seq_copy_in_user(void *ptr, void *src, int size) { char __user **bufptr = ptr; if (copy_to_user(*bufptr, src, size)) return -EFAULT; *bufptr += size; return 0; } static int expand_var_event(const struct snd_seq_event *event, int offset, int size, char *buf, bool in_kernel) { if (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR) { if (! in_kernel) return -EINVAL; if (copy_from_user(buf, (char __force __user *)event->data.ext.ptr + offset, size)) return -EFAULT; return 0; } return dump_var_event(event, in_kernel ? seq_copy_in_kernel : seq_copy_in_user, &buf, offset, size); } int snd_seq_expand_var_event(const struct snd_seq_event *event, int count, char *buf, int in_kernel, int size_aligned) { int len, newlen, err; len = get_var_len(event); if (len < 0) return len; newlen = len; if (size_aligned > 0) newlen = roundup(len, size_aligned); if (count < newlen) return -EAGAIN; err = expand_var_event(event, 0, len, buf, in_kernel); if (err < 0) return err; if (len != newlen) { if (in_kernel) memset(buf + len, 0, newlen - len); else if (clear_user((__force void __user *)buf + len, newlen - len)) return -EFAULT; } return newlen; } EXPORT_SYMBOL(snd_seq_expand_var_event); int snd_seq_expand_var_event_at(const struct snd_seq_event *event, int count, char *buf, int offset) { int len, err; len = get_var_len(event); if (len < 0) return len; if (len <= offset) return 0; len -= offset; if (len > count) len = count; err = expand_var_event(event, offset, count, buf, true); if (err < 0) return err; return len; } EXPORT_SYMBOL_GPL(snd_seq_expand_var_event_at); /* * release this cell, free extended data if available */ static inline void free_cell(struct snd_seq_pool *pool, struct snd_seq_event_cell *cell) { cell->next = pool->free; pool->free = cell; atomic_dec(&pool->counter); } void snd_seq_cell_free(struct snd_seq_event_cell * cell) { struct snd_seq_pool *pool; if (snd_BUG_ON(!cell)) return; pool = cell->pool; if (snd_BUG_ON(!pool)) return; guard(spinlock_irqsave)(&pool->lock); free_cell(pool, cell); if (snd_seq_ev_is_variable(&cell->event)) { if (cell->event.data.ext.len & SNDRV_SEQ_EXT_CHAINED) { struct snd_seq_event_cell *curp, *nextptr; curp = cell->event.data.ext.ptr; for (; curp; curp = nextptr) { nextptr = curp->next; curp->next = pool->free; free_cell(pool, curp); } } } if (waitqueue_active(&pool->output_sleep)) { /* has enough space now? */ if (snd_seq_output_ok(pool)) wake_up(&pool->output_sleep); } } /* * allocate an event cell. */ static int snd_seq_cell_alloc(struct snd_seq_pool *pool, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp) { struct snd_seq_event_cell *cell; unsigned long flags; int err = -EAGAIN; wait_queue_entry_t wait; if (pool == NULL) return -EINVAL; *cellp = NULL; init_waitqueue_entry(&wait, current); spin_lock_irqsave(&pool->lock, flags); if (pool->ptr == NULL) { /* not initialized */ pr_debug("ALSA: seq: pool is not initialized\n"); err = -EINVAL; goto __error; } while (pool->free == NULL && ! nonblock && ! pool->closing) { set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&pool->output_sleep, &wait); spin_unlock_irqrestore(&pool->lock, flags); if (mutexp) mutex_unlock(mutexp); schedule(); if (mutexp) mutex_lock(mutexp); spin_lock_irqsave(&pool->lock, flags); remove_wait_queue(&pool->output_sleep, &wait); /* interrupted? */ if (signal_pending(current)) { err = -ERESTARTSYS; goto __error; } } if (pool->closing) { /* closing.. */ err = -ENOMEM; goto __error; } cell = pool->free; if (cell) { int used; pool->free = cell->next; atomic_inc(&pool->counter); used = atomic_read(&pool->counter); if (pool->max_used < used) pool->max_used = used; pool->event_alloc_success++; /* clear cell pointers */ cell->next = NULL; err = 0; } else pool->event_alloc_failures++; *cellp = cell; __error: spin_unlock_irqrestore(&pool->lock, flags); return err; } /* * duplicate the event to a cell. * if the event has external data, the data is decomposed to additional * cells. */ int snd_seq_event_dup(struct snd_seq_pool *pool, struct snd_seq_event *event, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp) { int ncells, err; unsigned int extlen; struct snd_seq_event_cell *cell; int size; *cellp = NULL; ncells = 0; extlen = 0; if (snd_seq_ev_is_variable(event)) { extlen = event->data.ext.len & ~SNDRV_SEQ_EXT_MASK; ncells = DIV_ROUND_UP(extlen, sizeof(struct snd_seq_event)); } if (ncells >= pool->total_elements) return -ENOMEM; err = snd_seq_cell_alloc(pool, &cell, nonblock, file, mutexp); if (err < 0) return err; /* copy the event */ size = snd_seq_event_packet_size(event); memcpy(&cell->ump, event, size); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (size < sizeof(cell->event)) cell->ump.raw.extra = 0; #endif /* decompose */ if (snd_seq_ev_is_variable(event)) { int len = extlen; int is_chained = event->data.ext.len & SNDRV_SEQ_EXT_CHAINED; int is_usrptr = event->data.ext.len & SNDRV_SEQ_EXT_USRPTR; struct snd_seq_event_cell *src, *tmp, *tail; char *buf; cell->event.data.ext.len = extlen | SNDRV_SEQ_EXT_CHAINED; cell->event.data.ext.ptr = NULL; src = (struct snd_seq_event_cell *)event->data.ext.ptr; buf = (char *)event->data.ext.ptr; tail = NULL; while (ncells-- > 0) { size = sizeof(struct snd_seq_event); if (len < size) size = len; err = snd_seq_cell_alloc(pool, &tmp, nonblock, file, mutexp); if (err < 0) goto __error; if (cell->event.data.ext.ptr == NULL) cell->event.data.ext.ptr = tmp; if (tail) tail->next = tmp; tail = tmp; /* copy chunk */ if (is_chained && src) { tmp->event = src->event; src = src->next; } else if (is_usrptr) { if (copy_from_user(&tmp->event, (char __force __user *)buf, size)) { err = -EFAULT; goto __error; } } else { memcpy(&tmp->event, buf, size); } buf += size; len -= size; } } *cellp = cell; return 0; __error: snd_seq_cell_free(cell); return err; } /* poll wait */ int snd_seq_pool_poll_wait(struct snd_seq_pool *pool, struct file *file, poll_table *wait) { poll_wait(file, &pool->output_sleep, wait); guard(spinlock_irq)(&pool->lock); return snd_seq_output_ok(pool); } /* allocate room specified number of events */ int snd_seq_pool_init(struct snd_seq_pool *pool) { int cell; struct snd_seq_event_cell *cellptr; if (snd_BUG_ON(!pool)) return -EINVAL; cellptr = kvmalloc_array(pool->size, sizeof(struct snd_seq_event_cell), GFP_KERNEL); if (!cellptr) return -ENOMEM; /* add new cells to the free cell list */ guard(spinlock_irq)(&pool->lock); if (pool->ptr) { kvfree(cellptr); return 0; } pool->ptr = cellptr; pool->free = NULL; for (cell = 0; cell < pool->size; cell++) { cellptr = pool->ptr + cell; cellptr->pool = pool; cellptr->next = pool->free; pool->free = cellptr; } pool->room = (pool->size + 1) / 2; /* init statistics */ pool->max_used = 0; pool->total_elements = pool->size; return 0; } /* refuse the further insertion to the pool */ void snd_seq_pool_mark_closing(struct snd_seq_pool *pool) { if (snd_BUG_ON(!pool)) return; guard(spinlock_irqsave)(&pool->lock); pool->closing = 1; } /* remove events */ int snd_seq_pool_done(struct snd_seq_pool *pool) { struct snd_seq_event_cell *ptr; if (snd_BUG_ON(!pool)) return -EINVAL; /* wait for closing all threads */ if (waitqueue_active(&pool->output_sleep)) wake_up(&pool->output_sleep); while (atomic_read(&pool->counter) > 0) schedule_timeout_uninterruptible(1); /* release all resources */ scoped_guard(spinlock_irq, &pool->lock) { ptr = pool->ptr; pool->ptr = NULL; pool->free = NULL; pool->total_elements = 0; } kvfree(ptr); guard(spinlock_irq)(&pool->lock); pool->closing = 0; return 0; } /* init new memory pool */ struct snd_seq_pool *snd_seq_pool_new(int poolsize) { struct snd_seq_pool *pool; /* create pool block */ pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; spin_lock_init(&pool->lock); pool->ptr = NULL; pool->free = NULL; pool->total_elements = 0; atomic_set(&pool->counter, 0); pool->closing = 0; init_waitqueue_head(&pool->output_sleep); pool->size = poolsize; /* init statistics */ pool->max_used = 0; return pool; } /* remove memory pool */ int snd_seq_pool_delete(struct snd_seq_pool **ppool) { struct snd_seq_pool *pool = *ppool; *ppool = NULL; if (pool == NULL) return 0; snd_seq_pool_mark_closing(pool); snd_seq_pool_done(pool); kfree(pool); return 0; } /* exported to seq_clientmgr.c */ void snd_seq_info_pool(struct snd_info_buffer *buffer, struct snd_seq_pool *pool, char *space) { if (pool == NULL) return; snd_iprintf(buffer, "%sPool size : %d\n", space, pool->total_elements); snd_iprintf(buffer, "%sCells in use : %d\n", space, atomic_read(&pool->counter)); snd_iprintf(buffer, "%sPeak cells in use : %d\n", space, pool->max_used); snd_iprintf(buffer, "%sAlloc success : %d\n", space, pool->event_alloc_success); snd_iprintf(buffer, "%sAlloc failures : %d\n", space, pool->event_alloc_failures); }
70 11597 12394 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_H #define _ASM_X86_PAGE_H #include <linux/types.h> #ifdef __KERNEL__ #include <asm/page_types.h> #ifdef CONFIG_X86_64 #include <asm/page_64.h> #else #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLER__ struct page; #include <linux/range.h> extern struct range pfn_mapped[]; extern int nr_pfn_mapped; static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); } static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { copy_page(to, from); } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) #endif #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ /* * We need __phys_reloc_hide() here because gcc may assume that there is no * overflow during __pa() calculation and can optimize it unexpectedly. * Newer versions of gcc provide -fno-strict-overflow switch to handle this * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) /* * virt_to_page(kaddr) returns a valid pointer if and only if * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) static __always_inline void *pfn_to_kaddr(unsigned long pfn) { return __va(pfn << PAGE_SHIFT); } static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) { return __canonical_address(vaddr, vaddr_bits) == vaddr; } #endif /* __ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */
15 15 15 15 15 15 15 15 15 15 2 2 2 1 2 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 // SPDX-License-Identifier: GPL-2.0-only /* * vivid-kthread-out.h - video/vbi output thread support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/font.h> #include <linux/mutex.h> #include <linux/videodev2.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/random.h> #include <linux/v4l2-dv-timings.h> #include <linux/jiffies.h> #include <asm/div64.h> #include <media/videobuf2-vmalloc.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-fh.h> #include <media/v4l2-event.h> #include "vivid-core.h" #include "vivid-vid-common.h" #include "vivid-vid-cap.h" #include "vivid-vid-out.h" #include "vivid-radio-common.h" #include "vivid-radio-rx.h" #include "vivid-radio-tx.h" #include "vivid-sdr-cap.h" #include "vivid-vbi-cap.h" #include "vivid-vbi-out.h" #include "vivid-osd.h" #include "vivid-ctrls.h" #include "vivid-kthread-out.h" #include "vivid-meta-out.h" static void vivid_thread_vid_out_tick(struct vivid_dev *dev) { struct vivid_buffer *vid_out_buf = NULL; struct vivid_buffer *vbi_out_buf = NULL; struct vivid_buffer *meta_out_buf = NULL; dprintk(dev, 1, "Video Output Thread Tick\n"); /* Drop a certain percentage of buffers. */ if (dev->perc_dropped_buffers && get_random_u32_below(100) < dev->perc_dropped_buffers) return; spin_lock(&dev->slock); /* * Only dequeue buffer if there is at least one more pending. * This makes video loopback possible. */ if (!list_empty(&dev->vid_out_active) && !list_is_singular(&dev->vid_out_active)) { vid_out_buf = list_entry(dev->vid_out_active.next, struct vivid_buffer, list); list_del(&vid_out_buf->list); } if (!list_empty(&dev->vbi_out_active) && (dev->field_out != V4L2_FIELD_ALTERNATE || (dev->vbi_out_seq_count & 1))) { vbi_out_buf = list_entry(dev->vbi_out_active.next, struct vivid_buffer, list); list_del(&vbi_out_buf->list); } if (!list_empty(&dev->meta_out_active)) { meta_out_buf = list_entry(dev->meta_out_active.next, struct vivid_buffer, list); list_del(&meta_out_buf->list); } spin_unlock(&dev->slock); if (!vid_out_buf && !vbi_out_buf && !meta_out_buf) return; if (vid_out_buf) { v4l2_ctrl_request_setup(vid_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); v4l2_ctrl_request_complete(vid_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); vid_out_buf->vb.sequence = dev->vid_out_seq_count; if (dev->field_out == V4L2_FIELD_ALTERNATE) { /* * The sequence counter counts frames, not fields. * So divide by two. */ vid_out_buf->vb.sequence /= 2; } vid_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&vid_out_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vid_out buffer %d done\n", vid_out_buf->vb.vb2_buf.index); } if (vbi_out_buf) { v4l2_ctrl_request_setup(vbi_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); v4l2_ctrl_request_complete(vbi_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); if (dev->stream_sliced_vbi_out) vivid_sliced_vbi_out_process(dev, vbi_out_buf); vbi_out_buf->vb.sequence = dev->vbi_out_seq_count; vbi_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&vbi_out_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vbi_out buffer %d done\n", vbi_out_buf->vb.vb2_buf.index); } if (meta_out_buf) { v4l2_ctrl_request_setup(meta_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); v4l2_ctrl_request_complete(meta_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); vivid_meta_out_process(dev, meta_out_buf); meta_out_buf->vb.sequence = dev->meta_out_seq_count; meta_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&meta_out_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "meta_out buffer %d done\n", meta_out_buf->vb.vb2_buf.index); } dev->dqbuf_error = false; } static int vivid_thread_vid_out(void *data) { struct vivid_dev *dev = data; u64 numerators_since_start; u64 buffers_since_start; u64 next_jiffies_since_start; unsigned long jiffies_since_start; unsigned long cur_jiffies; unsigned wait_jiffies; unsigned numerator; unsigned denominator; dprintk(dev, 1, "Video Output Thread Start\n"); set_freezable(); /* Resets frame counters */ dev->out_seq_offset = 0; dev->out_seq_count = 0; dev->jiffies_vid_out = jiffies; dev->out_seq_resync = false; if (dev->time_wrap) dev->time_wrap_offset = dev->time_wrap - ktime_get_ns(); else dev->time_wrap_offset = 0; for (;;) { try_to_freeze(); if (kthread_should_stop()) break; if (!mutex_trylock(&dev->mutex)) { schedule(); continue; } cur_jiffies = jiffies; if (dev->out_seq_resync) { dev->jiffies_vid_out = cur_jiffies; dev->out_seq_offset = dev->out_seq_count + 1; dev->out_seq_count = 0; dev->out_seq_resync = false; } numerator = dev->timeperframe_vid_out.numerator; denominator = dev->timeperframe_vid_out.denominator; if (dev->field_out == V4L2_FIELD_ALTERNATE) denominator *= 2; /* Calculate the number of jiffies since we started streaming */ jiffies_since_start = cur_jiffies - dev->jiffies_vid_out; /* Get the number of buffers streamed since the start */ buffers_since_start = (u64)jiffies_since_start * denominator + (HZ * numerator) / 2; do_div(buffers_since_start, HZ * numerator); /* * After more than 0xf0000000 (rounded down to a multiple of * 'jiffies-per-day' to ease jiffies_to_msecs calculation) * jiffies have passed since we started streaming reset the * counters and keep track of the sequence offset. */ if (jiffies_since_start > JIFFIES_RESYNC) { dev->jiffies_vid_out = cur_jiffies; dev->out_seq_offset = buffers_since_start; buffers_since_start = 0; } dev->out_seq_count = buffers_since_start + dev->out_seq_offset; dev->vid_out_seq_count = dev->out_seq_count - dev->vid_out_seq_start; dev->vbi_out_seq_count = dev->out_seq_count - dev->vbi_out_seq_start; dev->meta_out_seq_count = dev->out_seq_count - dev->meta_out_seq_start; vivid_thread_vid_out_tick(dev); mutex_unlock(&dev->mutex); /* * Calculate the number of 'numerators' streamed since we started, * not including the current buffer. */ numerators_since_start = buffers_since_start * numerator; /* And the number of jiffies since we started */ jiffies_since_start = jiffies - dev->jiffies_vid_out; /* Increase by the 'numerator' of one buffer */ numerators_since_start += numerator; /* * Calculate when that next buffer is supposed to start * in jiffies since we started streaming. */ next_jiffies_since_start = numerators_since_start * HZ + denominator / 2; do_div(next_jiffies_since_start, denominator); /* If it is in the past, then just schedule asap */ if (next_jiffies_since_start < jiffies_since_start) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; if (!time_is_after_jiffies(cur_jiffies + wait_jiffies)) continue; wait_queue_head_t wait; init_waitqueue_head(&wait); wait_event_interruptible_timeout(wait, kthread_should_stop(), cur_jiffies + wait_jiffies - jiffies); } dprintk(dev, 1, "Video Output Thread End\n"); return 0; } static void vivid_grab_controls(struct vivid_dev *dev, bool grab) { v4l2_ctrl_grab(dev->ctrl_has_crop_out, grab); v4l2_ctrl_grab(dev->ctrl_has_compose_out, grab); v4l2_ctrl_grab(dev->ctrl_has_scaler_out, grab); v4l2_ctrl_grab(dev->ctrl_tx_mode, grab); v4l2_ctrl_grab(dev->ctrl_tx_rgb_range, grab); } int vivid_start_generating_vid_out(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_out) { u32 seq_count = dev->out_seq_count + dev->seq_wrap * 128; if (pstreaming == &dev->vid_out_streaming) dev->vid_out_seq_start = seq_count; else if (pstreaming == &dev->vbi_out_streaming) dev->vbi_out_seq_start = seq_count; else dev->meta_out_seq_start = seq_count; *pstreaming = true; return 0; } /* Resets frame counters */ dev->jiffies_vid_out = jiffies; dev->vid_out_seq_start = dev->seq_wrap * 128; dev->vbi_out_seq_start = dev->seq_wrap * 128; dev->meta_out_seq_start = dev->seq_wrap * 128; dev->kthread_vid_out = kthread_run(vivid_thread_vid_out, dev, "%s-vid-out", dev->v4l2_dev.name); if (IS_ERR(dev->kthread_vid_out)) { int err = PTR_ERR(dev->kthread_vid_out); dev->kthread_vid_out = NULL; v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n"); return err; } *pstreaming = true; vivid_grab_controls(dev, true); dprintk(dev, 1, "returning from %s\n", __func__); return 0; } void vivid_stop_generating_vid_out(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_out == NULL) return; *pstreaming = false; if (pstreaming == &dev->vid_out_streaming) { /* Release all active buffers */ while (!list_empty(&dev->vid_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vid_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vid_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->vbi_out_streaming) { while (!list_empty(&dev->vbi_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vbi_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vbi_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->meta_out_streaming) { while (!list_empty(&dev->meta_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->meta_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "meta_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (dev->vid_out_streaming || dev->vbi_out_streaming || dev->meta_out_streaming) return; /* shutdown control thread */ vivid_grab_controls(dev, false); kthread_stop(dev->kthread_vid_out); dev->kthread_vid_out = NULL; }
3 1 3 2454 5 5 5 1 1 1 2 1 1 4 1 4 1 4 4 4 4 4 4 75 77 78 1 78 307 2448 725 727 4 4 4 5 7 7 7 359 359 359 359 2453 2456 2451 2448 2447 2451 2450 2446 8 6 8 5 2452 2450 2401 1179 1345 2404 2402 2402 2395 2398 2404 2326 153 162 1200 1198 1198 1196 1200 214 214 214 213 396 440 396 442 537 439 88 4 4 8 6 6 6 6 6 2 4 4 4 6 8 2470 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 // SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org> * * debugfs is for people to use instead of /proc or /sys. * See ./Documentation/core-api/kernel-api.rst for more details. */ #define pr_fmt(fmt) "debugfs: " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/security.h> #include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { ret = security_locked_down(LOCKDOWN_DEBUGFS); if (ret) return ret; } return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_dir_inode_operations = { .lookup = simple_lookup, .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_symlink_inode_operations = { .get_link = simple_get_link, .setattr = debugfs_setattr, }; static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); } return inode; } struct debugfs_fs_info { kuid_t uid; kgid_t gid; umode_t mode; /* Opt_* bitfield. */ unsigned int opts; }; enum { Opt_uid, Opt_gid, Opt_mode, Opt_source, }; static const struct fs_parameter_spec debugfs_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_uid ("uid", Opt_uid), fsparam_string ("source", Opt_source), {} }; static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct debugfs_fs_info *opts = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, debugfs_param_specs, param, &result); if (opt < 0) { /* * We might like to report bad mount options here; but * traditionally debugfs has ignored all mount options */ if (opt == -ENOPARAM) return 0; return opt; } switch (opt) { case Opt_uid: opts->uid = result.uid; break; case Opt_gid: opts->gid = result.gid; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; case Opt_source: if (fc->source) return invalfc(fc, "Multiple sources specified"); fc->source = param->string; param->string = NULL; break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options */ } opts->opts |= BIT(opt); return 0; } static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); /* * On remount, only reset mode/uid/gid if they were provided as mount * options. */ if (!remount || fsi->opts & BIT(Opt_mode)) { inode->i_mode &= ~S_IALLUGO; inode->i_mode |= fsi->mode; } if (!remount || fsi->opts & BIT(Opt_uid)) inode->i_uid = fsi->uid; if (!remount || fsi->opts & BIT(Opt_gid)) inode->i_gid = fsi->gid; } static void debugfs_apply_options(struct super_block *sb) { _debugfs_apply_options(sb, false); } static void debugfs_apply_options_remount(struct super_block *sb) { _debugfs_apply_options(sb, true); } static int debugfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct debugfs_fs_info *sb_opts = sb->s_fs_info; struct debugfs_fs_info *new_opts = fc->s_fs_info; if (!new_opts) return 0; sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; debugfs_apply_options_remount(sb); return 0; } static int debugfs_show_options(struct seq_file *m, struct dentry *root) { struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, fsi->uid)); if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, fsi->gid)); if (fsi->mode != DEBUGFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", fsi->mode); return 0; } static struct kmem_cache *debugfs_inode_cachep __ro_after_init; static void init_once(void *foo) { struct debugfs_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static struct inode *debugfs_alloc_inode(struct super_block *sb) { struct debugfs_inode_info *info; info = alloc_inode_sb(sb, debugfs_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void debugfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(debugfs_inode_cachep, DEBUGFS_I(inode)); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .show_options = debugfs_show_options, .alloc_inode = debugfs_alloc_inode, .free_inode = debugfs_free_inode, }; static void debugfs_release_dentry(struct dentry *dentry) { struct debugfs_fsdata *fsd = dentry->d_fsdata; if (fsd) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } kfree(fsd); } static struct vfsmount *debugfs_automount(struct path *path) { struct inode *inode = path->dentry->d_inode; return DEBUGFS_I(inode)->automount(path->dentry, inode->i_private); } static const struct dentry_operations debugfs_dops = { .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr debug_files[] = {{""}}; int err; err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); if (err) return err; sb->s_op = &debugfs_super_operations; set_default_d_op(sb, &debugfs_dops); sb->s_d_flags |= DCACHE_DONTCACHE; debugfs_apply_options(sb); return 0; } static int debugfs_get_tree(struct fs_context *fc) { int err; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return -EPERM; err = get_tree_single(fc, debugfs_fill_super); if (err) return err; return debugfs_reconfigure(fc); } static void debugfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations debugfs_context_ops = { .free = debugfs_free_fc, .parse_param = debugfs_parse_param, .get_tree = debugfs_get_tree, .reconfigure = debugfs_reconfigure, }; static int debugfs_init_fs_context(struct fs_context *fc) { struct debugfs_fs_info *fsi; fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); if (!fsi) return -ENOMEM; fsi->mode = DEBUGFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &debugfs_context_ops; return 0; } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", .init_fs_context = debugfs_init_fs_context, .parameters = debugfs_param_specs, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("debugfs"); /** * debugfs_lookup() - look up an existing debugfs file * @name: a pointer to a string containing the name of the file to look up. * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file * doesn't exist or an error occurs, %NULL will be returned. The returned * dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); if (IS_ERR(dentry)) return NULL; return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); static struct dentry *debugfs_start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); if (!debugfs_initialized()) return ERR_PTR(-ENOENT); pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) return parent; error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) { pr_err("Unable to pin filesystem for file '%s'\n", name); return ERR_PTR(error); } /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. */ if (!parent) parent = debugfs_mount->mnt_root; dentry = simple_start_creating(parent, name); if (IS_ERR(dentry)) { if (dentry == ERR_PTR(-EEXIST)) pr_err("'%s' already exists in '%pd'\n", name, parent); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } return dentry; } static struct dentry *failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } static struct dentry *end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct file_operations *proxy_fops, const void *real_fops) { struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); return failed_creating(dentry); } inode->i_mode = mode; inode->i_private = data; inode->i_op = &debugfs_file_inode_operations; if (!real_fops) proxy_fops = &debugfs_noop_file_operations; inode->i_fop = proxy_fops; DEBUGFS_I(inode)->raw = real_fops; DEBUGFS_I(inode)->aux = (void *)aux; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_full); struct dentry *debugfs_create_file_short(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct debugfs_short_fops *fops) { return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_short_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_short); /** * debugfs_create_file_unsafe - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * debugfs_create_file_unsafe() is completely analogous to * debugfs_create_file(), the only difference being that the fops * handed it will not get protected against file removals by the * debugfs core. * * It is your responsibility to protect your struct file_operation * methods against file removals by means of debugfs_file_get() * and debugfs_file_put(). ->open() is still protected by * debugfs though. * * Any struct file_operations defined by means of * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and * thus, may be used here. */ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, NULL, &debugfs_open_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @file_size: initial file size * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) */ void debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, loff_t file_size) { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); /** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", name); return failed_creating(dentry); } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); /** * debugfs_create_automount - create automount point in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @f: function to be called when pathname resolution steps on that one. * @data: opaque argument to pass to f(). * * @f should return what ->d_automount() would. */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, debugfs_automount_t f, void *data) { struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); return failed_creating(dentry); } make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; DEBUGFS_I(inode)->automount = f; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); /** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. * @target: a pointer to a string containing the path to the target of the * symbolic link. * * This function creates a symbolic link with the given name in debugfs that * links to the given target path. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target) { struct dentry *dentry; struct inode *inode; char *link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); return dentry; } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create symlink '%s'\n", name); kfree(link); return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); static void __debugfs_file_removed(struct dentry *dentry) { struct debugfs_fsdata *fsd; /* * Paired with the closing smp_mb() implied by a successful * cmpxchg() in debugfs_file_get(): either * debugfs_file_get() must see a dead dentry or we must see a * debugfs_fsdata instance at ->d_fsdata here (or both). */ smp_mb(); fsd = READ_ONCE(dentry->d_fsdata); if (!fsd) return; /* if this was the last reference, we're done */ if (refcount_dec_and_test(&fsd->active_users)) return; /* * If there's still a reference, the code that obtained it can * be in different states: * - The common case of not using cancellations, or already * after debugfs_leave_cancellation(), where we just need * to wait for debugfs_file_put() which signals the completion; * - inside a cancellation section, i.e. between * debugfs_enter_cancellation() and debugfs_leave_cancellation(), * in which case we need to trigger the ->cancel() function, * and then wait for debugfs_file_put() just like in the * previous case; * - before debugfs_enter_cancellation() (but obviously after * debugfs_file_get()), in which case we may not see the * cancellation in the list on the first round of the loop, * but debugfs_enter_cancellation() signals the completion * after adding it, so this code gets woken up to call the * ->cancel() function. */ while (refcount_read(&fsd->active_users)) { struct debugfs_cancellation *c; /* * Lock the cancellations. Note that the cancellations * structs are meant to be on the stack, so we need to * ensure we either use them here or don't touch them, * and debugfs_leave_cancellation() will wait for this * to be finished processing before exiting one. It may * of course win and remove the cancellation, but then * chances are we never even got into this bit, we only * do if the refcount isn't zero already. */ mutex_lock(&fsd->cancellations_mtx); while ((c = list_first_entry_or_null(&fsd->cancellations, typeof(*c), list))) { list_del_init(&c->list); c->cancel(dentry, c->cancel_data); } mutex_unlock(&fsd->cancellations_mtx); wait_for_completion(&fsd->active_users_drained); } } static void remove_one(struct dentry *victim) { if (d_is_reg(victim)) __debugfs_file_removed(victim); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } /** * debugfs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * * This function recursively removes a directory tree in debugfs that * was previously created with a call to another debugfs function * (like debugfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed, no automatic cleanup of files will happen when a module is * removed, you are responsible here. */ void debugfs_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it * @name: a pointer to a string containing the name of the item to look up. * @parent: a pointer to the parent dentry of the item. * * This is the equlivant of doing something like * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting * handled for the directory being looked up. */ void debugfs_lookup_and_remove(const char *name, struct dentry *parent) { struct dentry *dentry; dentry = debugfs_lookup(name, parent); if (!dentry) return; debugfs_remove(dentry); dput(dentry); } EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); /** * debugfs_change_name - rename a file/directory in the debugfs filesystem * @dentry: dentry of an object to be renamed. * @fmt: format for new name * * This function renames a file/directory in debugfs. The target must not * exist for rename to succeed. * * This function will return 0 on success and -E... on failure. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, ...) { int error = 0; const char *new_name; struct name_snapshot old_name; struct dentry *parent, *target; struct inode *dir; va_list ap; if (IS_ERR_OR_NULL(dentry)) return 0; va_start(ap, fmt); new_name = kvasprintf_const(GFP_KERNEL, fmt, ap); va_end(ap); if (!new_name) return -ENOMEM; parent = dget_parent(dentry); dir = d_inode(parent); inode_lock(dir); take_dentry_name_snapshot(&old_name, dentry); if (WARN_ON_ONCE(dentry->d_parent != parent)) { error = -EINVAL; goto out; } if (strcmp(old_name.name.name, new_name) == 0) goto out; target = lookup_noperm(&QSTR(new_name), parent); if (IS_ERR(target)) { error = PTR_ERR(target); goto out; } if (d_really_is_positive(target)) { dput(target); error = -EINVAL; goto out; } simple_rename_timestamp(dir, dentry, dir, target); d_move(dentry, target); dput(target); fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry); out: release_dentry_name_snapshot(&old_name); inode_unlock(dir); dput(parent); kfree_const(new_name); return error; } EXPORT_SYMBOL_GPL(debugfs_change_name); /** * debugfs_initialized - Tells whether debugfs has been registered */ bool debugfs_initialized(void) { return debugfs_registered; } EXPORT_SYMBOL_GPL(debugfs_initialized); static int __init debugfs_kernel(char *str) { if (str) { if (!strcmp(str, "on")) debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT; else if (!strcmp(str, "no-mount")) debugfs_allow = DEBUGFS_ALLOW_API; else if (!strcmp(str, "off")) debugfs_allow = 0; } return 0; } early_param("debugfs", debugfs_kernel); static int __init debugfs_init(void) { int retval; if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT)) return -EPERM; retval = sysfs_create_mount_point(kernel_kobj, "debug"); if (retval) return retval; debugfs_inode_cachep = kmem_cache_create("debugfs_inode_cache", sizeof(struct debugfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (debugfs_inode_cachep == NULL) { sysfs_remove_mount_point(kernel_kobj, "debug"); return -ENOMEM; } retval = register_filesystem(&debug_fs_type); if (retval) { // Really not going to happen sysfs_remove_mount_point(kernel_kobj, "debug"); kmem_cache_destroy(debugfs_inode_cachep); return retval; } debugfs_registered = true; return 0; } core_initcall(debugfs_init);
245 247 247 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H #include <asm/barrier.h> #include <linux/virtio.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> /* * Barriers in virtio are tricky. Non-SMP virtio guests can't assume * they're not on an SMP host system, so they need to assume real * barriers. Non-SMP virtio hosts could skip the barriers, but does * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous * CPUs) we do need real barriers. In theory, we could be using both * kinds of virtio, so it's a runtime decision, and the branch is * actually quite cheap. */ static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) virt_mb(); else mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) virt_rmb(); else dma_rmb(); } static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) virt_wmb(); else dma_wmb(); } #define virtio_store_mb(weak_barriers, p, v) \ do { \ if (weak_barriers) { \ virt_store_mb(*p, v); \ } else { \ WRITE_ONCE(*p, v); \ mb(); \ } \ } while (0) \ struct virtio_device; struct virtqueue; struct device; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Creates a virtqueue and allocates the descriptor ring with per * virtqueue mapping operations. */ struct virtqueue *vring_create_virtqueue_map(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, union virtio_map map); /* * Creates a virtqueue with a standard layout but a caller-allocated * ring. */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Destroys a virtqueue. If created with vring_create_virtqueue, this * also frees the ring. */ void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */
1743 115 1745 8 1 7 7 18 18 18 18 12 11 1 10 2 2 2 8 6 2 5 4 5 1 1 6 12 12 18 642 14 227 7 8 6 740 722 740 1103 1781 1781 1781 1764 1781 3 2 1780 2 1780 642 1778 1781 1780 1781 22 1779 1771 2 1767 91 1770 1765 1102 1103 1101 1103 842 890 888 981 766 888 929 39 1095 1094 649 637 1095 1091 1091 1 714 654 654 11 10 9 56 1749 127 1119 127 96 1749 1626 1548 1184 1633 892 890 864 12 851 784 132 132 131 1604 1079 947 1633 641 641 7 641 1955 1955 670 1965 1964 1168 871 655 1512 223 3 499 1962 1956 1950 1955 1963 1362 1509 1510 1513 9505 9496 9498 822 237 822 12 9 9 9 6 6 7 2 215 26 34 37 38 13 38 37 36 36 36 35 3 34 28 6 34 34 30 30 30 30 30 6 6 6 5 6 4 4 2 25 30 18 32 13 201 201 201 201 201 201 201 201 201 200 201 201 62 201 201 201 201 1920 1914 272 5 3 1917 4 4 1924 219 1931 113 1912 457 26 4 4 18 145 145 8 1 8 8 1 7 341 341 341 341 340 341 341 341 341 91 91 341 340 341 341 341 341 340 3 341 341 18 341 340 341 30 340 341 341 341 341 341 341 1 91 91 91 91 1 1 92 92 92 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 // SPDX-License-Identifier: GPL-2.0-only /* * mm/mmap.c * * Written by obz. * * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/memfd.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #define CREATE_TRACE_POINTS #include <trace/events/mmap.h> #include "internal.h" #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX; int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check * @len: The size of increase. * * Return: 0 on success. */ static int check_brk_limits(unsigned long addr, unsigned long len) { unsigned long mapped_addr; mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate = false; LIST_HEAD(uf); struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; min_brk = mm->start_brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ if (!current->brk_randomized) min_brk = mm->end_data; #endif if (brk < min_brk) goto out; /* * Check against rlimit here. If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) { mm->brk = brk; goto success; } /* Always allow shrinking brk. */ if (brk <= mm->brk) { /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. * do_vmi_align_munmap() will drop the lock on success, so * update it before calling do_vma_munmap(). */ mm->brk = brk; if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf, /* unlock = */ true)) goto out; goto success_unlocked; } if (check_brk_limits(oldbrk, newbrk - oldbrk)) goto out; /* * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ vma_iter_init(&vmi, mm, oldbrk); next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); return hint; } bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; locked_pages += mm->locked_vm; limit_pages = rlimit(RLIMIT_MEMLOCK); limit_pages >>= PAGE_SHIFT; return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { if (S_ISREG(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISSOCK(inode->i_mode)) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ return ULONG_MAX; } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { u64 maxsize = file_mmap_size_max(file, inode); if (maxsize && len > maxsize) return false; maxsize -= len; if (pgoff > maxsize >> PAGE_SHIFT) return false; return true; } /** * do_mmap() - Perform a userland memory mapping into the current process * address space of length @len with protection bits @prot, mmap flags @flags * (from which VMA flags will be inferred), and any additional VMA flags to * apply @vm_flags. If this is a file-backed mapping then the file is specified * in @file and page offset into the file via @pgoff. * * This function does not perform security checks on the file and assumes, if * @uf is non-NULL, the caller has provided a list head to track unmap events * for userfaultfd @uf. * * It also simply indicates whether memory population is required by setting * @populate, which must be non-NULL, expecting the caller to actually perform * this task itself if appropriate. * * This function will invoke architecture-specific (and if provided and * relevant, file system-specific) logic to determine the most appropriate * unmapped area in which to place the mapping if not MAP_FIXED. * * Callers which require userland mmap() behaviour should invoke vm_mmap(), * which is also exported for module use. * * Those which require this behaviour less security checks, userfaultfd and * populate behaviour, and who handle the mmap write lock themselves, should * call this function. * * Note that the returned address may reside within a merged VMA if an * appropriate merge were to take place, so it doesn't necessarily specify the * start of a VMA, rather only the start of a valid mapped range of length * @len bytes, rounded down to the nearest page size. * * The caller must write-lock current->mm->mmap_lock. * * @file: An optional struct file pointer describing the file which is to be * mapped, if a file-backed mapping. * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the * address at which to perform this mapping. See mmap (2) for details. Must be * page-aligned. * @len: The length of the mapping. Will be page-aligned and must be at least 1 * page in size. * @prot: Protection bits describing access required to the mapping. See mmap * (2) for details. * @flags: Flags specifying how the mapping should be performed, see mmap (2) * for details. * @vm_flags: VMA flags which should be set by default, or 0 otherwise. * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. * @populate: A pointer to a value which will be set to 0 if no population of * the range is required, or the number of bytes to populate if it is. Must be * non-NULL. See mmap (2) for details as to under what circumstances population * of the range occurs. * @uf: An optional pointer to a list head to track userfaultfd unmap events * should unmapping events arise. If provided, it is up to the caller to manage * this. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; mmap_assert_write_locked(mm); if (!len) return -EINVAL; /* * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec * mounted, in which case we don't add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) return -ENOMEM; /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) return -ENOMEM; /* * addr is returned from get_unmapped_area, * There are two cases: * 1> MAP_FIXED == false * unallocated memory, no need to check sealing. * 1> MAP_FIXED == true * sealing is checked inside mmap_region when * do_vmi_munmap is called. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) pkey = 0; } /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; flags_mask = LEGACY_MAP_MASK; if (file->f_op->fop_flags & FOP_MMAP_SYNC) flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: /* * Force use of MAP_SHARED_VALIDATE with non-legacy * flags. E.g. MAP_SYNC is dangerous to use with * MAP_SHARED as you don't know which consistency model * you will get. We silently ignore unsupported flags * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; if (prot & PROT_WRITE) { if (!(file->f_mode & FMODE_WRITE)) return -EACCES; if (IS_SWAPFILE(file->f_mapping->host)) return -ETXTBSY; } /* * Make sure we don't allow writing to an append-only * file.. */ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!can_mmap_file(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; break; default: return -EINVAL; } /* * Check to see if we are violating any seals and update VMA * flags if necessary to avoid future seal violations. */ err = memfd_check_seals_mmap(file, &vm_flags); if (err) return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) return -ENOTSUPP; /* * A locked or stack area makes no sense to be droppable. * * Also, since droppable pages can just go away at any time * it makes no sense to copy them on fork or dump them. * * And don't attempt to combine with hugetlb for now. */ if (flags & (MAP_LOCKED | MAP_HUGETLB)) return -EINVAL; if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) return -EINVAL; vm_flags |= VM_DROPPABLE; /* * If the pages can be dropped, then it doesn't make * sense to reserve them. */ vm_flags |= VM_NORESERVE; /* * Likewise, they're volatile enough that they * shouldn't survive forks or coredumps. */ vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } /* * Set 'VM_NORESERVE' if we should not account for the * memory use of this mapping. */ if (flags & MAP_NORESERVE) { /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) return -EBADF; if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (unlikely(flags & MAP_HUGETLB)) { retval = -EINVAL; goto out_fput; } } else if (flags & MAP_HUGETLB) { struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); return retval; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) { struct mmap_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; if (offset_in_page(a.offset)) return -EINVAL; return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. */ static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) { if (vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } /* * Search for an unmapped address range. * * We are looking for a range that: * - does not intersect with any VMA; * - is contained within the [low_limit, high_limit) interval; * - is at least the desired size. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) */ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { unsigned long addr; if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) addr = unmapped_area_topdown(info); else addr = unmapped_area(info); trace_vm_unmapped_area(addr, info); return addr; } /* Get an address range which is currently unmapped. * For shmat() with addr=0. * * Ugly calling convention alert: * Return value with the low bits set means error value, * ie * if (ret & ~PAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. */ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); return vm_unmapped_area(&info); } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); } #endif unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { if (mm_flags_test(MMF_TOPDOWN, mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ if (len > TASK_SIZE) return -ENOMEM; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. */ if (!file) pgoff = 0; if (get_area) { addr = get_area(file, addr, len, pgoff, flags); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; error = security_mmap_addr(addr); return error ? error : addr; } unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area_vmflags(mm, file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { unsigned long index = start_addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { unsigned long index = addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, addr); vma = vma_iter_load(&vmi); *pprev = vma_prev(&vmi); if (!vma) vma = vma_next(&vmi); return vma; } /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; static int __init cmdline_parse_stack_guard_gap(char *p) { unsigned long val; char *endptr; val = simple_strtoul(p, &endptr, 10); if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; addr &= PAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; if (!prev) return NULL; if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_downwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) return vma; start = vma->vm_start; if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif #if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up(vma,addr) expand_upwards(vma, addr) #define vma_expand_down(vma, addr) (-EFAULT) #else #define vma_expand_up(vma,addr) (-EFAULT) #define vma_expand_down(vma, addr) expand_downwards(vma, addr) #endif /* * expand_stack(): legacy interface for page faulting. Don't use unless * you have to. * * This is called with the mm locked for reading, drops the lock, takes * the lock for writing, tries to look up a vma again, expands it if * necessary, and downgrades the lock to reading again. * * If no vma is found or it can't be expanded, it returns NULL and has * dropped the lock. */ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; mmap_read_unlock(mm); if (mmap_write_lock_killable(mm)) return NULL; vma = find_vma_prev(mm, addr, &prev); if (vma && vma->vm_start <= addr) goto success; if (prev && !vma_expand_up(prev, addr)) { vma = prev; goto success; } if (vma && !vma_expand_down(vma, addr)) goto success; mmap_write_unlock(mm); return NULL; success: mmap_write_downgrade(mm); return vma; } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap * @len: The length to be munmapped. * @uf: The userfaultfd list_head * * Return: 0 on success, error otherwise. */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { VMA_ITERATOR(vmi, mm, start); return do_vmi_munmap(&vmi, mm, start, len, uf, false); } int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); } EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); return __vm_munmap(addr, len, true); } /* * Emulation of deprecated remap_file_pages() syscall. */ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; vm_flags_t vm_flags; pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) return ret; start = start & PAGE_MASK; size = size & PAGE_MASK; if (start + size <= start) return ret; /* Does pgoff wrap? */ if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; if (mmap_read_lock_killable(mm)) return -EINTR; /* * Look up VMA under read lock first so we can perform the security * without holding locks (which can be problematic). We reacquire a * write lock later and check nothing changed underneath us. */ vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) { mmap_read_unlock(mm); return -EINVAL; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; /* Save vm_flags used to calculate prot and flags, and recheck later. */ vm_flags = vma->vm_flags; file = get_file(vma->vm_file); mmap_read_unlock(mm); /* Call outside mmap_lock to be consistent with other callers. */ ret = security_mmap_file(file, prot, flags); if (ret) { fput(file); return ret; } ret = -EINVAL; /* OK security check passed, take write lock + let it rip. */ if (mmap_write_lock_killable(mm)) { fput(file); return -EINTR; } vma = vma_lookup(mm, start); if (!vma) goto out; /* Make sure things didn't change under us. */ if (vma->vm_flags != vm_flags) goto out; if (vma->vm_file != file) goto out; if (start + size > vma->vm_end) { VMA_ITERATOR(vmi, mm, vma->vm_end); struct vm_area_struct *next, *prev = vma; for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) goto out; if (next->vm_flags != vma->vm_flags) goto out; if (start + size <= next->vm_end) break; prev = next; } if (!next) goto out; } ret = do_mmap(vma->vm_file, start, size, prot, flags, 0, pgoff, &populate, NULL); out: mmap_write_unlock(mm); fput(file); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) ret = 0; return ret; } int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) return -ENOMEM; if (!len) return 0; /* Until we need other flags, refuse anything except VM_EXEC. */ if ((vm_flags & (~VM_EXEC)) != 0) return -EINVAL; if (mmap_write_lock_killable(mm)) return -EINTR; ret = check_brk_limits(addr, len); if (ret) goto limits_failed; ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; vma = vma_prev(&vmi); ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; munmap_failed: limits_failed: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(vm_brk_flags); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; VMA_ITERATOR(vmi, mm, 0); int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); mmap_read_lock(mm); arch_exit_mmap(mm); vma = vma_next(&vmi); if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); mmap_write_lock(mm); goto destroy; } flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); mmap_read_unlock(mm); /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); /* * Walk the list again, actually closing and freeing it, with preemption * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. */ vma_iter_set(&vmi, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma_mark_detached(vma); remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } /* * Return true if the calling process may expand its vm space by the passed * number of pages */ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA), ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); if (!ignore_rlimit_data) return false; } return true; } void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; else if (is_stack_mapping(flags)) mm->stack_vm += npages; else if (is_data_mapping(flags)) mm->data_vm += npages; } static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Close hook, called for unmap() and on the old vma for mremap(). * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { const struct vm_special_mapping *sm = vma->vm_private_data; if (sm->close) sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) { return ((struct vm_special_mapping *)vma->vm_private_data)->name; } static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; if (sm->mremap) return sm->mremap(sm, new_vma); return 0; } static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) { /* * Forbid splitting special mappings - kernel has expectations over * the number of pages in mapping. Together with VM_DONTEXPAND * the size of vma should stay the same over the special mapping's * lifetime. */ return -EINVAL; } static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, .may_split = special_mapping_split, }; static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { struct page *page = *pages; get_page(page); vmf->page = page; return 0; } return VM_FAULT_SIGBUS; } static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) goto out; vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); return vma; out: vm_area_free(vma); return ERR_PTR(ret); } bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && vma->vm_ops == &special_mapping_vmops; } /* * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. * The region past the last page supplied will always produce SIGBUS. * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); } #ifdef CONFIG_SYSCTL #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif static const struct ctl_table mmap_table[] = { { .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS { .procname = "mmap_rnd_bits", .data = &mmap_rnd_bits, .maxlen = sizeof(mmap_rnd_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_bits_min, .extra2 = (void *)&mmap_rnd_bits_max, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS { .procname = "mmap_rnd_compat_bits", .data = &mmap_rnd_compat_bits, .maxlen = sizeof(mmap_rnd_compat_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif }; #endif /* CONFIG_SYSCTL */ /* * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { int ret; ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif vma_state_init(); } /* * Initialise sysctl_user_reserve_kbytes. * * This is intended to prevent a user from starting a single memory hogging * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER * mode. * * The default value is min(3% of free memory, 128MB) * 128MB is enough to recover with sshd/login, bash, and top/kill. */ static int init_user_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. * * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin * to log in and kill a memory hogging process. * * Systems with more than 256MB will reserve 8MB, enough to recover * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will * only reserve 3% of free pages by default. */ static int init_admin_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. * * The default user reserve max is 128MB, and the default max for the * admin reserve is 8MB. These are usually, but not always, enough to * enable recovery from a memory hogging process using login/sshd, a shell, * and tools like top. It may make sense to increase or even disable the * reserve depending on the existence of swap or variations in the recovery * tools. So, the admin may have changed them. * * If memory is added and the reserves have been eliminated or increased above * the default max, then we'll trust the admin. * * If memory is removed and there isn't enough free memory, then we * need to reset the reserves. * * Otherwise keep the reserve set by the admin. */ static int reserve_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { unsigned long tmp, free_kbytes; switch (action) { case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; case MEM_OFFLINE: free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); pr_info("vm.user_reserve_kbytes reset to %lu\n", sysctl_user_reserve_kbytes); } if (sysctl_admin_reserve_kbytes > free_kbytes) { init_admin_reserve(); pr_info("vm.admin_reserve_kbytes reset to %lu\n", sysctl_admin_reserve_kbytes); } break; default: break; } return NOTIFY_OK; } static int __meminit init_reserve_notifier(void) { if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } subsys_initcall(init_reserve_notifier); /* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. * * This function is intended only for obtaining an argument page from an ELF * image, and is almost certainly NOT what you want to use for any other * purpose. * * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the * VMA referenced must not be linked in any user-visible tree, i.e. it must be a * new VMA being mapped. * * The function assumes that addr is either contained within the VMA or below * it, and makes no attempt to validate this value beyond that. * * Returns true if the read lock was obtained and a stack was perhaps expanded, * false if the stack expansion failed. * * On stack expansion the function temporarily acquires an mmap write lock * before downgrading it. */ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *new_vma, unsigned long addr, bool write) { if (!write || addr >= new_vma->vm_start) { mmap_read_lock(mm); return true; } if (!(new_vma->vm_flags & VM_GROWSDOWN)) return false; mmap_write_lock(mm); if (expand_downwards(new_vma, addr)) { mmap_write_unlock(mm); return false; } mmap_write_downgrade(mm); return true; } __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); if (mmap_write_lock_killable(oldmm)) return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); for_each_vma(vmi, mpnt) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end, GFP_KERNEL); if (retval) goto loop_out; vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; /* * Don't duplicate many vmas if we've been oom-killed (for * example) */ if (fatal_signal_pending(current)) { retval = -EINTR; goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { /* * VM_WIPEONFORK gets a clean slate in the child. * Don't prepare anon_vma until fault since we don't * copy page for current vma. */ tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); /* * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); /* * Link the vma into the MT. After using __mt_dup(), memory * allocation is not necessary here, so it cannot fail. */ vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; get_file(file); i_mmap_lock_write(mapping); if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); if (retval) { mpnt = vma_next(&vmi); goto loop_out; } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else { /* * The entire maple tree has already been duplicated. If the * mmap duplication fails, mark the failure point with * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, * stop releasing VMAs that have not been duplicated after this * point. */ if (mpnt) { mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); if (!retval) dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); return retval; fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto loop_out; }
4 5 3 4 4 5 10 4 6 9 10 4 3 4 6 1 3 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for x86_64/AVX2 assembler optimized version of Serpent * * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ #include <linux/module.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/serpent.h> #include "serpent-avx.h" #include "ecb_cbc_helpers.h" #define SERPENT_AVX2_PARALLEL_BLOCKS 16 /* 16-way AVX2 parallel cipher functions */ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); ECB_BLOCK(1, __serpent_encrypt); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); ECB_BLOCK(1, __serpent_decrypt); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); CBC_ENC_BLOCK(__serpent_encrypt); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way); CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); CBC_DEC_BLOCK(1, __serpent_decrypt); CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "ecb(serpent)", .base.cra_driver_name = "ecb-serpent-avx2", .base.cra_priority = 600, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "cbc(serpent)", .base.cra_driver_name = "cbc-serpent-avx2", .base.cra_priority = 600, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .ivsize = SERPENT_BLOCK_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, }; static int __init serpent_avx2_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return crypto_register_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } static void __exit serpent_avx2_fini(void) { crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_avx2_init); module_exit(serpent_avx2_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); MODULE_ALIAS_CRYPTO("serpent"); MODULE_ALIAS_CRYPTO("serpent-asm");
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 /* * Glue Code for assembler optimized version of TWOFISH * * Originally Twofish for GPG * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 * 256-bit key length added March 20, 1999 * Some modifications to reduce the text size by Werner Koch, April, 1998 * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com> * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net> * * The original author has disclaimed all copyright interest in this * code and thus put it in the public domain. The subsequent authors * have put this under the GNU General Public License. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA * * This code is a "clean room" implementation, written from the paper * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available * through http://www.counterpane.com/twofish.html * * For background information on multiplication in finite fields, used for * the matrix operations in the key schedule, see the book _Contemporary * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the * Third Edition. */ #include <crypto/algapi.h> #include <crypto/twofish.h> #include <linux/export.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(twofish_enc_blk); asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(twofish_dec_blk); static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { twofish_enc_blk(crypto_tfm_ctx(tfm), dst, src); } static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { twofish_dec_blk(crypto_tfm_ctx(tfm), dst, src); } static struct crypto_alg alg = { .cra_name = "twofish", .cra_driver_name = "twofish-asm", .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = TF_MIN_KEY_SIZE, .cia_max_keysize = TF_MAX_KEY_SIZE, .cia_setkey = twofish_setkey, .cia_encrypt = twofish_encrypt, .cia_decrypt = twofish_decrypt } } }; static int __init twofish_glue_init(void) { return crypto_register_alg(&alg); } static void __exit twofish_glue_fini(void) { crypto_unregister_alg(&alg); } module_init(twofish_glue_init); module_exit(twofish_glue_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); MODULE_ALIAS_CRYPTO("twofish"); MODULE_ALIAS_CRYPTO("twofish-asm");
4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 // SPDX-License-Identifier: GPL-2.0 /* * ASCII values for a number of symbolic constants, printing functions, * etc. * Additions for SCSI 2 and Linux 2.2.x by D. Gilbert (990422) * Additions for SCSI 3+ (SPC-3 T10/1416-D Rev 07 3 May 2002) * by D. Gilbert and aeb (20020609) * Updated to SPC-4 T10/1713-D Rev 36g, D. Gilbert 20130701 */ #include <linux/blkdev.h> #include <linux/module.h> #include <linux/kernel.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_dbg.h> /* Commands with service actions that change the command name */ #define THIRD_PARTY_COPY_OUT 0x83 #define THIRD_PARTY_COPY_IN 0x84 struct sa_name_list { int opcode; const struct value_name_pair *arr; int arr_sz; }; struct value_name_pair { int value; const char * name; }; static const char * cdb_byte0_names[] = { /* 00-03 */ "Test Unit Ready", "Rezero Unit/Rewind", NULL, "Request Sense", /* 04-07 */ "Format Unit/Medium", "Read Block Limits", NULL, "Reassign Blocks", /* 08-0d */ "Read(6)", NULL, "Write(6)", "Seek(6)", NULL, NULL, /* 0e-12 */ NULL, "Read Reverse", "Write Filemarks", "Space", "Inquiry", /* 13-16 */ "Verify(6)", "Recover Buffered Data", "Mode Select(6)", "Reserve(6)", /* 17-1a */ "Release(6)", "Copy", "Erase", "Mode Sense(6)", /* 1b-1d */ "Start/Stop Unit", "Receive Diagnostic", "Send Diagnostic", /* 1e-1f */ "Prevent/Allow Medium Removal", NULL, /* 20-22 */ NULL, NULL, NULL, /* 23-28 */ "Read Format Capacities", "Set Window", "Read Capacity(10)", NULL, NULL, "Read(10)", /* 29-2d */ "Read Generation", "Write(10)", "Seek(10)", "Erase(10)", "Read updated block", /* 2e-31 */ "Write Verify(10)", "Verify(10)", "Search High", "Search Equal", /* 32-34 */ "Search Low", "Set Limits", "Prefetch/Read Position", /* 35-37 */ "Synchronize Cache(10)", "Lock/Unlock Cache(10)", "Read Defect Data(10)", /* 38-3c */ "Medium Scan", "Compare", "Copy Verify", "Write Buffer", "Read Buffer", /* 3d-3f */ "Update Block", "Read Long(10)", "Write Long(10)", /* 40-41 */ "Change Definition", "Write Same(10)", /* 42-48 */ "Unmap/Read sub-channel", "Read TOC/PMA/ATIP", "Read density support", "Play audio(10)", "Get configuration", "Play audio msf", "Sanitize/Play audio track/index", /* 49-4f */ "Play track relative(10)", "Get event status notification", "Pause/resume", "Log Select", "Log Sense", "Stop play/scan", NULL, /* 50-55 */ "Xdwrite", "Xpwrite, Read disk info", "Xdread, Read track info", "Reserve track", "Send OPC info", "Mode Select(10)", /* 56-5b */ "Reserve(10)", "Release(10)", "Repair track", "Read master cue", "Mode Sense(10)", "Close track/session", /* 5c-5f */ "Read buffer capacity", "Send cue sheet", "Persistent reserve in", "Persistent reserve out", /* 60-67 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 68-6f */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 70-77 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 78-7f */ NULL, NULL, NULL, NULL, NULL, NULL, "Extended CDB", "Variable length", /* 80-84 */ "Xdwrite(16)", "Rebuild(16)", "Regenerate(16)", "Third party copy out", "Third party copy in", /* 85-89 */ "ATA command pass through(16)", "Access control in", "Access control out", "Read(16)", "Compare and Write", /* 8a-8f */ "Write(16)", "ORWrite", "Read attributes", "Write attributes", "Write and verify(16)", "Verify(16)", /* 90-94 */ "Pre-fetch(16)", "Synchronize cache(16)", "Lock/unlock cache(16)", "Write same(16)", NULL, /* 95-99 */ NULL, NULL, NULL, NULL, NULL, /* 9a-9f */ NULL, NULL, NULL, "Service action bidirectional", "Service action in(16)", "Service action out(16)", /* a0-a5 */ "Report luns", "ATA command pass through(12)/Blank", "Security protocol in", "Maintenance in", "Maintenance out", "Move medium/play audio(12)", /* a6-a9 */ "Exchange medium", "Move medium attached", "Read(12)", "Play track relative(12)", /* aa-ae */ "Write(12)", NULL, "Erase(12), Get Performance", "Read DVD structure", "Write and verify(12)", /* af-b1 */ "Verify(12)", "Search data high(12)", "Search data equal(12)", /* b2-b4 */ "Search data low(12)", "Set limits(12)", "Read element status attached", /* b5-b6 */ "Security protocol out", "Send volume tag, set streaming", /* b7-b9 */ "Read defect data(12)", "Read element status", "Read CD msf", /* ba-bc */ "Redundancy group (in), Scan", "Redundancy group (out), Set cd-rom speed", "Spare (in), Play cd", /* bd-bf */ "Spare (out), Mechanism status", "Volume set (in), Read cd", "Volume set (out), Send DVD structure", }; static const struct value_name_pair maint_in_arr[] = { {0x5, "Report identifying information"}, {0xa, "Report target port groups"}, {0xb, "Report aliases"}, {0xc, "Report supported operation codes"}, {0xd, "Report supported task management functions"}, {0xe, "Report priority"}, {0xf, "Report timestamp"}, {0x10, "Management protocol in"}, }; #define MAINT_IN_SZ ARRAY_SIZE(maint_in_arr) static const struct value_name_pair maint_out_arr[] = { {0x6, "Set identifying information"}, {0xa, "Set target port groups"}, {0xb, "Change aliases"}, {0xc, "Remove I_T nexus"}, {0xe, "Set priority"}, {0xf, "Set timestamp"}, {0x10, "Management protocol out"}, }; #define MAINT_OUT_SZ ARRAY_SIZE(maint_out_arr) static const struct value_name_pair serv_in12_arr[] = { {0x1, "Read media serial number"}, }; #define SERV_IN12_SZ ARRAY_SIZE(serv_in12_arr) static const struct value_name_pair serv_out12_arr[] = { {-1, "dummy entry"}, }; #define SERV_OUT12_SZ ARRAY_SIZE(serv_out12_arr) static const struct value_name_pair serv_bidi_arr[] = { {-1, "dummy entry"}, }; #define SERV_BIDI_SZ ARRAY_SIZE(serv_bidi_arr) static const struct value_name_pair serv_in16_arr[] = { {0x10, "Read capacity(16)"}, {0x11, "Read long(16)"}, {0x12, "Get LBA status"}, {0x13, "Report referrals"}, }; #define SERV_IN16_SZ ARRAY_SIZE(serv_in16_arr) static const struct value_name_pair serv_out16_arr[] = { {0x11, "Write long(16)"}, {0x1f, "Notify data transfer device(16)"}, }; #define SERV_OUT16_SZ ARRAY_SIZE(serv_out16_arr) static const struct value_name_pair pr_in_arr[] = { {0x0, "Persistent reserve in, read keys"}, {0x1, "Persistent reserve in, read reservation"}, {0x2, "Persistent reserve in, report capabilities"}, {0x3, "Persistent reserve in, read full status"}, }; #define PR_IN_SZ ARRAY_SIZE(pr_in_arr) static const struct value_name_pair pr_out_arr[] = { {0x0, "Persistent reserve out, register"}, {0x1, "Persistent reserve out, reserve"}, {0x2, "Persistent reserve out, release"}, {0x3, "Persistent reserve out, clear"}, {0x4, "Persistent reserve out, preempt"}, {0x5, "Persistent reserve out, preempt and abort"}, {0x6, "Persistent reserve out, register and ignore existing key"}, {0x7, "Persistent reserve out, register and move"}, }; #define PR_OUT_SZ ARRAY_SIZE(pr_out_arr) /* SPC-4 rev 34 renamed the Extended Copy opcode to Third Party Copy Out. LID1 (List Identifier length: 1 byte) is the Extended Copy found in SPC-2 and SPC-3 */ static const struct value_name_pair tpc_out_arr[] = { {0x0, "Extended copy(LID1)"}, {0x1, "Extended copy(LID4)"}, {0x10, "Populate token"}, {0x11, "Write using token"}, {0x1c, "Copy operation abort"}, }; #define TPC_OUT_SZ ARRAY_SIZE(tpc_out_arr) static const struct value_name_pair tpc_in_arr[] = { {0x0, "Receive copy status(LID1)"}, {0x1, "Receive copy data(LID1)"}, {0x3, "Receive copy operating parameters"}, {0x4, "Receive copy failure details(LID1)"}, {0x5, "Receive copy status(LID4)"}, {0x6, "Receive copy data(LID4)"}, {0x7, "Receive ROD token information"}, {0x8, "Report all ROD tokens"}, }; #define TPC_IN_SZ ARRAY_SIZE(tpc_in_arr) static const struct value_name_pair variable_length_arr[] = { {0x1, "Rebuild(32)"}, {0x2, "Regenerate(32)"}, {0x3, "Xdread(32)"}, {0x4, "Xdwrite(32)"}, {0x5, "Xdwrite extended(32)"}, {0x6, "Xpwrite(32)"}, {0x7, "Xdwriteread(32)"}, {0x8, "Xdwrite extended(64)"}, {0x9, "Read(32)"}, {0xa, "Verify(32)"}, {0xb, "Write(32)"}, {0xc, "Write an verify(32)"}, {0xd, "Write same(32)"}, {0x8801, "Format OSD"}, {0x8802, "Create (osd)"}, {0x8803, "List (osd)"}, {0x8805, "Read (osd)"}, {0x8806, "Write (osd)"}, {0x8807, "Append (osd)"}, {0x8808, "Flush (osd)"}, {0x880a, "Remove (osd)"}, {0x880b, "Create partition (osd)"}, {0x880c, "Remove partition (osd)"}, {0x880e, "Get attributes (osd)"}, {0x880f, "Set attributes (osd)"}, {0x8812, "Create and write (osd)"}, {0x8815, "Create collection (osd)"}, {0x8816, "Remove collection (osd)"}, {0x8817, "List collection (osd)"}, {0x8818, "Set key (osd)"}, {0x8819, "Set master key (osd)"}, {0x881a, "Flush collection (osd)"}, {0x881b, "Flush partition (osd)"}, {0x881c, "Flush OSD"}, {0x8f7e, "Perform SCSI command (osd)"}, {0x8f7f, "Perform task management function (osd)"}, }; #define VARIABLE_LENGTH_SZ ARRAY_SIZE(variable_length_arr) static struct sa_name_list sa_names_arr[] = { {VARIABLE_LENGTH_CMD, variable_length_arr, VARIABLE_LENGTH_SZ}, {MAINTENANCE_IN, maint_in_arr, MAINT_IN_SZ}, {MAINTENANCE_OUT, maint_out_arr, MAINT_OUT_SZ}, {PERSISTENT_RESERVE_IN, pr_in_arr, PR_IN_SZ}, {PERSISTENT_RESERVE_OUT, pr_out_arr, PR_OUT_SZ}, {SERVICE_ACTION_IN_12, serv_in12_arr, SERV_IN12_SZ}, {SERVICE_ACTION_OUT_12, serv_out12_arr, SERV_OUT12_SZ}, {SERVICE_ACTION_BIDIRECTIONAL, serv_bidi_arr, SERV_BIDI_SZ}, {SERVICE_ACTION_IN_16, serv_in16_arr, SERV_IN16_SZ}, {SERVICE_ACTION_OUT_16, serv_out16_arr, SERV_OUT16_SZ}, {THIRD_PARTY_COPY_IN, tpc_in_arr, TPC_IN_SZ}, {THIRD_PARTY_COPY_OUT, tpc_out_arr, TPC_OUT_SZ}, {0, NULL, 0}, }; bool scsi_opcode_sa_name(int opcode, int service_action, const char **cdb_name, const char **sa_name) { struct sa_name_list *sa_name_ptr; const struct value_name_pair *arr = NULL; int arr_sz, k; *cdb_name = NULL; if (opcode >= VENDOR_SPECIFIC_CDB) return false; if (opcode < ARRAY_SIZE(cdb_byte0_names)) *cdb_name = cdb_byte0_names[opcode]; for (sa_name_ptr = sa_names_arr; sa_name_ptr->arr; ++sa_name_ptr) { if (sa_name_ptr->opcode == opcode) { arr = sa_name_ptr->arr; arr_sz = sa_name_ptr->arr_sz; break; } } if (!arr) return false; for (k = 0; k < arr_sz; ++k, ++arr) { if (service_action == arr->value) break; } if (k < arr_sz) *sa_name = arr->name; return true; } struct error_info { unsigned short code12; /* 0x0302 looks better than 0x03,0x02 */ unsigned short size; }; /* * There are 700+ entries in this table. To save space, we don't store * (code, pointer) pairs, which would make sizeof(struct * error_info)==16 on 64 bits. Rather, the second element just stores * the size (including \0) of the corresponding string, and we use the * sum of these to get the appropriate offset into additional_text * defined below. This approach saves 12 bytes per entry. */ static const struct error_info additional[] = { #define SENSE_CODE(c, s) {c, sizeof(s)}, #include "sense_codes.h" #undef SENSE_CODE }; static const char *additional_text = #define SENSE_CODE(c, s) s "\0" #include "sense_codes.h" #undef SENSE_CODE ; struct error_info2 { unsigned char code1, code2_min, code2_max; const char * str; const char * fmt; }; static const struct error_info2 additional2[] = { {0x40, 0x00, 0x7f, "Ram failure", ""}, {0x40, 0x80, 0xff, "Diagnostic failure on component", ""}, {0x41, 0x00, 0xff, "Data path failure", ""}, {0x42, 0x00, 0xff, "Power-on or self-test failure", ""}, {0x4D, 0x00, 0xff, "Tagged overlapped commands", "task tag "}, {0x70, 0x00, 0xff, "Decompression exception", "short algorithm id of "}, {0, 0, 0, NULL, NULL} }; /* description of the sense key values */ static const char * const snstext[] = { "No Sense", /* 0: There is no sense information */ "Recovered Error", /* 1: The last command completed successfully but used error correction */ "Not Ready", /* 2: The addressed target is not ready */ "Medium Error", /* 3: Data error detected on the medium */ "Hardware Error", /* 4: Controller or device failure */ "Illegal Request", /* 5: Error in request */ "Unit Attention", /* 6: Removable medium was changed, or the target has been reset, or ... */ "Data Protect", /* 7: Access to the data is blocked */ "Blank Check", /* 8: Reached unexpected written or unwritten region of the medium */ "Vendor Specific(9)", "Copy Aborted", /* A: COPY or COMPARE was aborted */ "Aborted Command", /* B: The target aborted the command */ "Equal", /* C: A SEARCH DATA command found data equal, reserved in SPC-4 rev 36 */ "Volume Overflow", /* D: Medium full with still data to be written */ "Miscompare", /* E: Source data and data on the medium do not agree */ "Completed", /* F: command completed sense data reported, may occur for successful command */ }; /* Get sense key string or NULL if not available */ const char * scsi_sense_key_string(unsigned char key) { if (key < ARRAY_SIZE(snstext)) return snstext[key]; return NULL; } EXPORT_SYMBOL(scsi_sense_key_string); /* * Get additional sense code string or NULL if not available. * This string may contain a "%x" and should be printed with ascq as arg. */ const char * scsi_extd_sense_format(unsigned char asc, unsigned char ascq, const char **fmt) { int i; unsigned short code = ((asc << 8) | ascq); unsigned offset = 0; *fmt = NULL; for (i = 0; i < ARRAY_SIZE(additional); i++) { if (additional[i].code12 == code) return additional_text + offset; offset += additional[i].size; } for (i = 0; additional2[i].fmt; i++) { if (additional2[i].code1 == asc && ascq >= additional2[i].code2_min && ascq <= additional2[i].code2_max) { *fmt = additional2[i].fmt; return additional2[i].str; } } return NULL; } EXPORT_SYMBOL(scsi_extd_sense_format); static const char * const hostbyte_table[]={ "DID_OK", "DID_NO_CONNECT", "DID_BUS_BUSY", "DID_TIME_OUT", "DID_BAD_TARGET", "DID_ABORT", "DID_PARITY", "DID_ERROR", "DID_RESET", "DID_BAD_INTR", "DID_PASSTHROUGH", "DID_SOFT_ERROR", "DID_IMM_RETRY", "DID_REQUEUE", "DID_TRANSPORT_DISRUPTED", "DID_TRANSPORT_FAILFAST", "DID_TARGET_FAILURE", "DID_NEXUS_FAILURE", "DID_ALLOC_FAILURE", "DID_MEDIUM_ERROR" }; const char *scsi_hostbyte_string(int result) { enum scsi_host_status hb = host_byte(result); const char *hb_string = NULL; if (hb < ARRAY_SIZE(hostbyte_table)) hb_string = hostbyte_table[hb]; return hb_string; } EXPORT_SYMBOL(scsi_hostbyte_string); #define scsi_mlreturn_name(result) { result, #result } static const struct value_name_pair scsi_mlreturn_arr[] = { scsi_mlreturn_name(NEEDS_RETRY), scsi_mlreturn_name(SUCCESS), scsi_mlreturn_name(FAILED), scsi_mlreturn_name(QUEUED), scsi_mlreturn_name(SOFT_ERROR), scsi_mlreturn_name(ADD_TO_MLQUEUE), scsi_mlreturn_name(TIMEOUT_ERROR), scsi_mlreturn_name(SCSI_RETURN_NOT_HANDLED), scsi_mlreturn_name(FAST_IO_FAIL) }; const char *scsi_mlreturn_string(int result) { const struct value_name_pair *arr = scsi_mlreturn_arr; int k; for (k = 0; k < ARRAY_SIZE(scsi_mlreturn_arr); ++k, ++arr) { if (result == arr->value) return arr->name; } return NULL; } EXPORT_SYMBOL(scsi_mlreturn_string);
52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 debugfs for wireless PHYs * * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2018 - 2019, 2021-2025 Intel Corporation */ #include <linux/debugfs.h> #include <linux/rtnetlink.h> #include <linux/vmalloc.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "debugfs.h" #define DEBUGFS_FORMAT_BUFFER_SIZE 100 int mac80211_format_buffer(char __user *userbuf, size_t count, loff_t *ppos, char *fmt, ...) { va_list args; char buf[DEBUGFS_FORMAT_BUFFER_SIZE]; int res; va_start(args, fmt); res = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } #define DEBUGFS_READONLY_FILE_FN(name, fmt, value...) \ static ssize_t name## _read(struct file *file, char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct ieee80211_local *local = file->private_data; \ \ return mac80211_format_buffer(userbuf, count, ppos, \ fmt "\n", ##value); \ } #define DEBUGFS_READONLY_FILE_OPS(name) \ static const struct debugfs_short_fops name## _ops = { \ .read = name## _read, \ .llseek = generic_file_llseek, \ }; #define DEBUGFS_READONLY_FILE(name, fmt, value...) \ DEBUGFS_READONLY_FILE_FN(name, fmt, value) \ DEBUGFS_READONLY_FILE_OPS(name) #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, phyd, local, &name## _ops) #define DEBUGFS_ADD_MODE(name, mode) \ debugfs_create_file(#name, mode, phyd, local, &name## _ops); DEBUGFS_READONLY_FILE(hw_conf, "%x", local->hw.conf.flags); DEBUGFS_READONLY_FILE(user_power, "%d", local->user_power_level); DEBUGFS_READONLY_FILE(power, "%d", local->hw.conf.power_level); DEBUGFS_READONLY_FILE(total_ps_buffered, "%d", local->total_ps_buffered); DEBUGFS_READONLY_FILE(wep_iv, "%#08x", local->wep_iv & 0xffffff); DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); static ssize_t aqm_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; struct fq *fq = &local->fq; char buf[200]; int len = 0; spin_lock_bh(&local->fq.lock); len = scnprintf(buf, sizeof(buf), "access name value\n" "R fq_flows_cnt %u\n" "R fq_backlog %u\n" "R fq_overlimit %u\n" "R fq_overmemory %u\n" "R fq_collisions %u\n" "R fq_memory_usage %u\n" "RW fq_memory_limit %u\n" "RW fq_limit %u\n" "RW fq_quantum %u\n", fq->flows_cnt, fq->backlog, fq->overmemory, fq->overlimit, fq->collisions, fq->memory_usage, fq->memory_limit, fq->limit, fq->quantum); spin_unlock_bh(&local->fq.lock); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aqm_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[100]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; else if (sscanf(buf, "fq_memory_limit %u", &local->fq.memory_limit) == 1) return count; else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1) return count; return -EINVAL; } static const struct debugfs_short_fops aqm_ops = { .write = aqm_write, .read = aqm_read, .llseek = default_llseek, }; static ssize_t airtime_flags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[128] = {}, *pos, *end; pos = buf; end = pos + sizeof(buf) - 1; if (local->airtime_flags & AIRTIME_USE_TX) pos += scnprintf(pos, end - pos, "AIRTIME_TX\t(%lx)\n", AIRTIME_USE_TX); if (local->airtime_flags & AIRTIME_USE_RX) pos += scnprintf(pos, end - pos, "AIRTIME_RX\t(%lx)\n", AIRTIME_USE_RX); return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } static ssize_t airtime_flags_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[16]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; return count; } static const struct debugfs_short_fops airtime_flags_ops = { .write = airtime_flags_write, .read = airtime_flags_read, .llseek = default_llseek, }; static ssize_t aql_pending_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[400]; int len = 0; len = scnprintf(buf, sizeof(buf), "AC AQL pending\n" "VO %u us\n" "VI %u us\n" "BE %u us\n" "BK %u us\n" "total %u us\n", atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VO]), atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VI]), atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BE]), atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BK]), atomic_read(&local->aql_total_pending_airtime)); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static const struct debugfs_short_fops aql_pending_ops = { .read = aql_pending_read, .llseek = default_llseek, }; static ssize_t aql_txq_limit_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[400]; int len = 0; len = scnprintf(buf, sizeof(buf), "AC AQL limit low AQL limit high\n" "VO %u %u\n" "VI %u %u\n" "BE %u %u\n" "BK %u %u\n", local->aql_txq_limit_low[IEEE80211_AC_VO], local->aql_txq_limit_high[IEEE80211_AC_VO], local->aql_txq_limit_low[IEEE80211_AC_VI], local->aql_txq_limit_high[IEEE80211_AC_VI], local->aql_txq_limit_low[IEEE80211_AC_BE], local->aql_txq_limit_high[IEEE80211_AC_BE], local->aql_txq_limit_low[IEEE80211_AC_BK], local->aql_txq_limit_high[IEEE80211_AC_BK]); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aql_txq_limit_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[100]; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; if (ac >= IEEE80211_NUM_ACS) return -EINVAL; q_limit_low_old = local->aql_txq_limit_low[ac]; q_limit_high_old = local->aql_txq_limit_high[ac]; guard(wiphy)(local->hw.wiphy); local->aql_txq_limit_low[ac] = q_limit_low; local->aql_txq_limit_high[ac] = q_limit_high; list_for_each_entry(sta, &local->sta_list, list) { /* If a sta has customized queue limits, keep it */ if (sta->airtime[ac].aql_limit_low == q_limit_low_old && sta->airtime[ac].aql_limit_high == q_limit_high_old) { sta->airtime[ac].aql_limit_low = q_limit_low; sta->airtime[ac].aql_limit_high = q_limit_high; } } return count; } static const struct debugfs_short_fops aql_txq_limit_ops = { .write = aql_txq_limit_write, .read = aql_txq_limit_read, .llseek = default_llseek, }; static ssize_t aql_enable_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; int len; len = scnprintf(buf, sizeof(buf), "%d\n", !static_key_false(&aql_disable.key)); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aql_enable_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool aql_disabled = static_key_false(&aql_disable.key); char buf[3]; size_t len; if (count > sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; buf[sizeof(buf) - 1] = '\0'; len = strlen(buf); if (len > 0 && buf[len - 1] == '\n') buf[len - 1] = 0; if (buf[0] == '0' && buf[1] == '\0') { if (!aql_disabled) static_branch_inc(&aql_disable); } else if (buf[0] == '1' && buf[1] == '\0') { if (aql_disabled) static_branch_dec(&aql_disable); } else { return -EINVAL; } return count; } static const struct debugfs_short_fops aql_enable_ops = { .write = aql_enable_write, .read = aql_enable_read, .llseek = default_llseek, }; static ssize_t force_tx_status_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[3]; int len = 0; len = scnprintf(buf, sizeof(buf), "%d\n", (int)local->force_tx_status); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t force_tx_status_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[3]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; else if (buf[0] == '1' && buf[1] == '\0') local->force_tx_status = 1; else return -EINVAL; return count; } static const struct debugfs_short_fops force_tx_status_ops = { .write = force_tx_status_write, .read = force_tx_status_read, .llseek = default_llseek, }; #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; int ret; rtnl_lock(); wiphy_lock(local->hw.wiphy); __ieee80211_suspend(&local->hw, NULL); ret = __ieee80211_resume(&local->hw); wiphy_unlock(local->hw.wiphy); if (ret) cfg80211_shutdown_all_interfaces(local->hw.wiphy); rtnl_unlock(); return count; } static const struct debugfs_short_fops reset_ops = { .write = reset_write, .llseek = noop_llseek, }; #endif static const char *hw_flag_names[] = { #define FLAG(F) [IEEE80211_HW_##F] = #F FLAG(HAS_RATE_CONTROL), FLAG(RX_INCLUDES_FCS), FLAG(HOST_BROADCAST_PS_BUFFERING), FLAG(SIGNAL_UNSPEC), FLAG(SIGNAL_DBM), FLAG(NEED_DTIM_BEFORE_ASSOC), FLAG(SPECTRUM_MGMT), FLAG(AMPDU_AGGREGATION), FLAG(SUPPORTS_PS), FLAG(PS_NULLFUNC_STACK), FLAG(SUPPORTS_DYNAMIC_PS), FLAG(MFP_CAPABLE), FLAG(WANT_MONITOR_VIF), FLAG(NO_VIRTUAL_MONITOR), FLAG(NO_AUTO_VIF), FLAG(SW_CRYPTO_CONTROL), FLAG(SUPPORT_FAST_XMIT), FLAG(REPORTS_TX_ACK_STATUS), FLAG(CONNECTION_MONITOR), FLAG(QUEUE_CONTROL), FLAG(SUPPORTS_PER_STA_GTK), FLAG(AP_LINK_PS), FLAG(TX_AMPDU_SETUP_IN_HW), FLAG(SUPPORTS_RC_TABLE), FLAG(P2P_DEV_ADDR_FOR_INTF), FLAG(TIMING_BEACON_ONLY), FLAG(SUPPORTS_HT_CCK_RATES), FLAG(CHANCTX_STA_CSA), FLAG(SUPPORTS_CLONED_SKBS), FLAG(SINGLE_SCAN_ON_ALL_BANDS), FLAG(TDLS_WIDER_BW), FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), FLAG(SUPPORTS_REORDERING_BUFFER), FLAG(USES_RSS), FLAG(TX_AMSDU), FLAG(TX_FRAG_LIST), FLAG(REPORTS_LOW_ACK), FLAG(SUPPORTS_TX_FRAG), FLAG(SUPPORTS_TDLS_BUFFER_STA), FLAG(DOESNT_SUPPORT_QOS_NDP), FLAG(BUFF_MMPDU_TXQ), FLAG(SUPPORTS_VHT_EXT_NSS_BW), FLAG(STA_MMPDU_TXQ), FLAG(TX_STATUS_NO_AMPDU_LEN), FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(AMPDU_KEYBORDER_SUPPORT), FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), FLAG(SUPPORTS_RX_DECAP_OFFLOAD), FLAG(SUPPORTS_CONC_MON_RX_DECAP), FLAG(DETECTS_COLOR_COLLISION), FLAG(MLO_MCAST_MULTI_LINK_TX), FLAG(DISALLOW_PUNCTURING), FLAG(HANDLES_QUIET_CSA), FLAG(STRICT), #undef FLAG }; static ssize_t hwflags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; size_t bufsz = 30 * NUM_IEEE80211_HW_FLAGS; char *buf = kzalloc(bufsz, GFP_KERNEL); char *pos = buf, *end = buf + bufsz - 1; ssize_t rv; int i; if (!buf) return -ENOMEM; /* fail compilation if somebody adds or removes * a flag without updating the name array above */ BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS); for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) pos += scnprintf(pos, end - pos, "%s\n", hw_flag_names[i]); } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); return rv; } static ssize_t hwflags_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[100]; int val; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (sscanf(buf, "strict=%d", &val) == 1) { switch (val) { case 0: ieee80211_hw_set(&local->hw, STRICT); return count; case 1: __clear_bit(IEEE80211_HW_STRICT, local->hw.flags); return count; default: return -EINVAL; } } return -EINVAL; } static const struct file_operations hwflags_ops = { .open = simple_open, .read = hwflags_read, .write = hwflags_write, }; static ssize_t misc_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */ size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9; char *buf; char *pos, *end; ssize_t rv; int i; int ln; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; pos = buf; end = buf + bufsz - 1; pos += scnprintf(pos, end - pos, "pending:\n"); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { ln = skb_queue_len(&local->pending[i]); pos += scnprintf(pos, end - pos, "[%i] %d\n", i, ln); } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); return rv; } static ssize_t queues_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; unsigned long flags; char buf[IEEE80211_MAX_QUEUES * 20]; int q, res = 0; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (q = 0; q < local->hw.queues; q++) res += sprintf(buf + res, "%02d: %#.8lx/%d\n", q, local->queue_stop_reasons[q], skb_queue_len(&local->pending[q])); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return simple_read_from_buffer(user_buf, count, ppos, buf, res); } DEBUGFS_READONLY_FILE_OPS(queues); DEBUGFS_READONLY_FILE_OPS(misc); /* statistics stuff */ static ssize_t format_devstat_counter(struct ieee80211_local *local, char __user *userbuf, size_t count, loff_t *ppos, int (*printvalue)(struct ieee80211_low_level_stats *stats, char *buf, int buflen)) { struct ieee80211_low_level_stats stats; char buf[20]; int res; wiphy_lock(local->hw.wiphy); res = drv_get_stats(local, &stats); wiphy_unlock(local->hw.wiphy); if (res) return res; res = printvalue(&stats, buf, sizeof(buf)); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } #define DEBUGFS_DEVSTATS_FILE(name) \ static int print_devstats_##name(struct ieee80211_low_level_stats *stats,\ char *buf, int buflen) \ { \ return scnprintf(buf, buflen, "%u\n", stats->name); \ } \ static ssize_t stats_ ##name## _read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ return format_devstat_counter(file->private_data, \ userbuf, \ count, \ ppos, \ print_devstats_##name); \ } \ \ static const struct debugfs_short_fops stats_ ##name## _ops = { \ .read = stats_ ##name## _read, \ .llseek = generic_file_llseek, \ }; #ifdef CONFIG_MAC80211_DEBUG_COUNTERS #define DEBUGFS_STATS_ADD(name) \ debugfs_create_u32(#name, 0400, statsd, &local->name); #endif #define DEBUGFS_DEVSTATS_ADD(name) \ debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops); DEBUGFS_DEVSTATS_FILE(dot11ACKFailureCount); DEBUGFS_DEVSTATS_FILE(dot11RTSFailureCount); DEBUGFS_DEVSTATS_FILE(dot11FCSErrorCount); DEBUGFS_DEVSTATS_FILE(dot11RTSSuccessCount); void debugfs_hw_add(struct ieee80211_local *local) { struct dentry *phyd = local->hw.wiphy->debugfsdir; struct dentry *statsd; if (!phyd) return; local->debugfs.keys = debugfs_create_dir("keys", phyd); DEBUGFS_ADD(total_ps_buffered); DEBUGFS_ADD(wep_iv); DEBUGFS_ADD(rate_ctrl_alg); DEBUGFS_ADD(queues); DEBUGFS_ADD(misc); #ifdef CONFIG_PM DEBUGFS_ADD_MODE(reset, 0200); #endif DEBUGFS_ADD_MODE(hwflags, 0600); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); DEBUGFS_ADD(hw_conf); DEBUGFS_ADD_MODE(force_tx_status, 0600); DEBUGFS_ADD_MODE(aql_enable, 0600); DEBUGFS_ADD(aql_pending); DEBUGFS_ADD_MODE(aqm, 0600); DEBUGFS_ADD_MODE(airtime_flags, 0600); DEBUGFS_ADD(aql_txq_limit); debugfs_create_u32("aql_threshold", 0600, phyd, &local->aql_threshold); statsd = debugfs_create_dir("statistics", phyd); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount); DEBUGFS_STATS_ADD(dot11FailedCount); DEBUGFS_STATS_ADD(dot11RetryCount); DEBUGFS_STATS_ADD(dot11MultipleRetryCount); DEBUGFS_STATS_ADD(dot11FrameDuplicateCount); DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); DEBUGFS_STATS_ADD(tx_handlers_queued); DEBUGFS_STATS_ADD(tx_handlers_drop_wep); DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port); DEBUGFS_STATS_ADD(rx_handlers_drop); DEBUGFS_STATS_ADD(rx_handlers_queued); DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc); DEBUGFS_STATS_ADD(rx_handlers_drop_defrag); DEBUGFS_STATS_ADD(tx_expand_skb_head); DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned); DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag); DEBUGFS_STATS_ADD(rx_handlers_fragments); DEBUGFS_STATS_ADD(tx_status_drop); #endif DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount); DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount); DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount); DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount); }
8 8 8 8 8 6 8 8 8 13 13 13 1 13 13 13 13 13 13 13 13 7 5 13 8 8 8 2 8 4 4 4 3 1 2 2 6 2 6 6 13 5 13 13 13 13 13 13 13 3 8 29 29 29 14 14 29 14 14 14 14 6 5 8 13 13 13 2 13 17 190 189 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle incoming frames * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE #include <net/netfilter/nf_queue.h> #endif #include <linux/neighbour.h> #include <net/arp.h> #include <net/dsa.h> #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" #include "br_private_tunnel.h" static int br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { br_drop_fake_rtable(skb); return netif_receive_skb(skb); } static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); struct net_bridge_vlan_group *vg; dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); /* Reset the offload_fwd_mark because there could be a stacked * bridge above, and it should not think this bridge it doing * that bridge's work forwarding out its ports. */ br_switchdev_frame_unmark(skb); /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. */ if (!(brdev->flags & IFF_PROMISC) && !br_allowed_egress(vg, skb)) { kfree_skb(skb); return NET_RX_DROP; } indev = skb->dev; skb->dev = brdev; skb = br_handle_vlan(br, NULL, vg, skb); if (!skb) return NET_RX_DROP; /* update the multicast stats if the packet is IGMP/MLD */ br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); BR_INPUT_SKB_CB(skb)->promisc = promisc; return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); } /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; bool promisc; u16 vid = 0; u8 state; if (!p) goto drop; br = p->br; if (br_mst_is_enabled(br)) { state = BR_STATE_FORWARDING; } else { if (p->state == BR_STATE_DISABLED) { reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; } state = p->state; } brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, &state, &vlan)) goto out; if (p->flags & BR_PORT_LOCKED) { struct net_bridge_fdb_entry *fdb_src = br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); if (!fdb_src) { /* FDB miss. Create locked FDB entry if MAB is enabled * and drop the packet. */ if (p->flags & BR_PORT_MAB) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } else if (READ_ONCE(fdb_src->dst) != p || test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { /* FDB mismatch. Drop the packet without roaming. */ goto drop; } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) { /* FDB match, but entry is locked. Refresh it and drop * the packet. */ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } } nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); promisc = !!(br->dev->flags & IFF_PROMISC); local_rcv = promisc; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { pkt_type = BR_PKT_BROADCAST; local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } if (state == BR_STATE_LEARNING) { reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; } BR_INPUT_SKB_CB(skb)->brdev = br->dev; BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); if (IS_ENABLED(CONFIG_INET) && (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *msg, _msg; msg = br_is_nd_neigh_msg(skb, &_msg); if (msg) br_do_suppress_nd(skb, br, vid, p, msg); } switch (pkt_type) { case BR_PKT_MULTICAST: mdst = br_mdb_entry_skb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(brmctx, skb) || br->dev->flags & IFF_ALLMULTI) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } mcast_hit = true; } else { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); if (unlikely(!dst && vid && br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) { dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0); if (dst && (!test_bit(BR_FDB_LOCAL, &dst->flags) || test_bit(BR_FDB_ADDED_BY_USER, &dst->flags))) dst = NULL; } break; default: break; } if (dst) { unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false, vid); else br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) return br_pass_frame_up(skb, promisc); out: return 0; drop: kfree_skb_reason(skb, reason); goto out; } EXPORT_SYMBOL_GPL(br_handle_frame_finish); static void __br_handle_local_finish(struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { __br_handle_local_finish(skb); /* return 1 to signal the okfn() was called so it's ok to use the skb */ return 1; } static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries *e = NULL; struct nf_hook_state state; unsigned int verdict, i; struct net *net; int ret; net = dev_net(skb->dev); #ifdef HAVE_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) goto frame_finish; #endif e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; nf_hook_state_init(&state, NF_BR_PRE_ROUTING, NFPROTO_BRIDGE, skb->dev, NULL, NULL, net, br_handle_frame_finish); for (i = 0; i < e->num_hook_entries; i++) { verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) { *pskb = skb; return RX_HANDLER_PASS; } break; case NF_DROP: kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; default: /* STOLEN */ return RX_HANDLER_CONSUMED; } } frame_finish: net = dev_net(skb->dev); br_handle_frame_finish(net, NULL, skb); #else br_handle_frame_finish(dev_net(skb->dev), NULL, skb); #endif return RX_HANDLER_CONSUMED; } /* Return 0 if the frame was not processed otherwise 1 * note: already called with rcu_read_lock */ static int br_process_frame_type(struct net_bridge_port *p, struct sk_buff *skb) { struct br_frame_type *tmp; hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list) if (unlikely(tmp->type == skb->protocol)) return tmp->frame_handler(p, skb); return 0; } /* * Return NULL if skb is handled * note: already called with rcu_read_lock */ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) { reason = SKB_DROP_REASON_MAC_INVALID_SOURCE; goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); br_tc_skb_miss_set(skb, false); p = br_port_get_rcu(skb->dev); if (p->flags & BR_VLAN_TUNNEL) br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p)); if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; /* * See IEEE 802.1D Table 7-10 Reserved addresses * * Assignment Value * Bridge Group Address 01-80-C2-00-00-00 * (MAC Control) 802.3 01-80-C2-00-00-01 * (Link Aggregation) 802.3 01-80-C2-00-00-02 * 802.1X PAE address 01-80-C2-00-00-03 * * 802.1AB LLDP 01-80-C2-00-00-0E * * Others reserved for future standardization */ fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, then must forward to keep loop detection */ if (p->br->stp_enabled == BR_NO_STP || fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL; goto drop; case 0x0E: /* 802.1AB LLDP */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; default: /* Allow selective forwarding for most other protocols */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; } BR_INPUT_SKB_CB(skb)->promisc = false; /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) * Thus return 1 from the okfn() to signal the skb is ok to pass */ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), NULL, skb, skb->dev, NULL, br_handle_local_finish) == 1) { return RX_HANDLER_PASS; } else { return RX_HANDLER_CONSUMED; } } if (unlikely(br_process_frame_type(p, skb))) return RX_HANDLER_PASS; forward: if (br_mst_is_enabled(p->br)) goto defer_stp_filtering; switch (p->state) { case BR_STATE_FORWARDING: case BR_STATE_LEARNING: defer_stp_filtering: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; return nf_hook_bridge_pre(skb, pskb); default: reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; drop: kfree_skb_reason(skb, reason); } return RX_HANDLER_CONSUMED; } /* This function has no purpose other than to appease the br_port_get_rcu/rtnl * helpers which identify bridged ports according to the rx_handler installed * on them (so there _needs_ to be a bridge rx_handler even if we don't need it * to do anything useful). This bridge won't support traffic to/from the stack, * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal * frames from the ETH_P_XDSA packet_type handler. */ static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) { return RX_HANDLER_PASS; } rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) { if (netdev_uses_dsa(dev)) return br_handle_frame_dummy; return br_handle_frame; } void br_add_frame(struct net_bridge *br, struct br_frame_type *ft) { hlist_add_head_rcu(&ft->list, &br->frame_type_list); } void br_del_frame(struct net_bridge *br, struct br_frame_type *ft) { struct br_frame_type *tmp; hlist_for_each_entry(tmp, &br->frame_type_list, list) if (ft == tmp) { hlist_del_rcu(&ft->list); return; } }
11 11 11 11 10 11 11 11 3 3 3 11 11 11 11 11 11 10 10 10 10 10 10 11 11 10 3 3 3 1 1 1 11 1 10 1 1 1 1 1 3 3 3 3 3 3 3 3 3 1 1 10 10 10 10 11 11 11 11 10 10 11 4 4 4 11 11 11 11 11 11 1 1 11 11 11 11 11 11 11 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner */ /* * UBI wear-leveling sub-system. * * This sub-system is responsible for wear-leveling. It works in terms of * physical eraseblocks and erase counters and knows nothing about logical * eraseblocks, volumes, etc. From this sub-system's perspective all physical * eraseblocks are of two types - used and free. Used physical eraseblocks are * those that were "get" by the 'ubi_wl_get_peb()' function, and free physical * eraseblocks are those that were put by the 'ubi_wl_put_peb()' function. * * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter * header. The rest of the physical eraseblock contains only %0xFF bytes. * * When physical eraseblocks are returned to the WL sub-system by means of the * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is * done asynchronously in context of the per-UBI device background thread, * which is also managed by the WL sub-system. * * The wear-leveling is ensured by means of moving the contents of used * physical eraseblocks with low erase counter to free physical eraseblocks * with high erase counter. * * If the WL sub-system fails to erase a physical eraseblock, it marks it as * bad. * * This sub-system is also responsible for scrubbing. If a bit-flip is detected * in a physical eraseblock, it has to be moved. Technically this is the same * as moving it for wear-leveling reasons. * * As it was said, for the UBI sub-system all physical eraseblocks are either * "free" or "used". Free eraseblock are kept in the @wl->free RB-tree, while * used eraseblocks are kept in @wl->used, @wl->erroneous, or @wl->scrub * RB-trees, as well as (temporarily) in the @wl->pq queue. * * When the WL sub-system returns a physical eraseblock, the physical * eraseblock is protected from being moved for some "time". For this reason, * the physical eraseblock is not directly moved from the @wl->free tree to the * @wl->used tree. There is a protection queue in between where this * physical eraseblock is temporarily stored (@wl->pq). * * All this protection stuff is needed because: * o we don't want to move physical eraseblocks just after we have given them * to the user; instead, we first want to let users fill them up with data; * * o there is a chance that the user will put the physical eraseblock very * soon, so it makes sense not to move it for some time, but wait. * * Physical eraseblocks stay protected only for limited time. But the "time" is * measured in erase cycles in this case. This is implemented with help of the * protection queue. Eraseblocks are put to the tail of this queue when they * are returned by the 'ubi_wl_get_peb()', and eraseblocks are removed from the * head of the queue on each erase operation (for any eraseblock). So the * length of the queue defines how may (global) erase cycles PEBs are protected. * * To put it differently, each physical eraseblock has 2 main states: free and * used. The former state corresponds to the @wl->free tree. The latter state * is split up on several sub-states: * o the WL movement is allowed (@wl->used tree); * o the WL movement is disallowed (@wl->erroneous) because the PEB is * erroneous - e.g., there was a read error; * o the WL movement is temporarily prohibited (@wl->pq queue); * o scrubbing is needed (@wl->scrub tree). * * Depending on the sub-state, wear-leveling entries of the used physical * eraseblocks may be kept in one of those structures. * * Note, in this implementation, we keep a small in-RAM object for each physical * eraseblock. This is surely not a scalable solution. But it appears to be good * enough for moderately large flashes and it is simple. In future, one may * re-work this sub-system and make it more scalable. * * At the moment this sub-system does not utilize the sequence number, which * was introduced relatively recently. But it would be wise to do this because * the sequence number of a logical eraseblock characterizes how old is it. For * example, when we move a PEB with low erase counter, and we need to pick the * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we * pick target PEB with an average EC if our PEB is not very "old". This is a * room for future re-works of the WL sub-system. */ #include <linux/slab.h> #include <linux/crc32.h> #include <linux/freezer.h> #include <linux/kthread.h> #include "ubi.h" #include "wl.h" /* Number of physical eraseblocks reserved for wear-leveling purposes */ #define WL_RESERVED_PEBS 1 /* * Maximum difference between two erase counters. If this threshold is * exceeded, the WL sub-system starts moving data from used physical * eraseblocks with low erase counter to free physical eraseblocks with high * erase counter. */ #define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD /* * When a physical eraseblock is moved, the WL sub-system has to pick the target * physical eraseblock to move to. The simplest way would be just to pick the * one with the highest erase counter. But in certain workloads this could lead * to an unlimited wear of one or few physical eraseblock. Indeed, imagine a * situation when the picked physical eraseblock is constantly erased after the * data is written to it. So, we have a constant which limits the highest erase * counter of the free physical eraseblock to pick. Namely, the WL sub-system * does not pick eraseblocks with erase counter greater than the lowest erase * counter plus %WL_FREE_MAX_DIFF. */ #define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD) /* * Maximum number of consecutive background thread failures which is enough to * switch to read-only mode. */ #define WL_MAX_FAILURES 32 static int self_check_ec(struct ubi_device *ubi, int pnum, int ec); static int self_check_in_wl_tree(const struct ubi_device *ubi, struct ubi_wl_entry *e, struct rb_root *root); static int self_check_in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e); /** * wl_tree_add - add a wear-leveling entry to a WL RB-tree. * @e: the wear-leveling entry to add * @root: the root of the tree * * Note, we use (erase counter, physical eraseblock number) pairs as keys in * the @ubi->used and @ubi->free RB-trees. */ static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root) { struct rb_node **p, *parent = NULL; p = &root->rb_node; while (*p) { struct ubi_wl_entry *e1; parent = *p; e1 = rb_entry(parent, struct ubi_wl_entry, u.rb); if (e->ec < e1->ec) p = &(*p)->rb_left; else if (e->ec > e1->ec) p = &(*p)->rb_right; else { ubi_assert(e->pnum != e1->pnum); if (e->pnum < e1->pnum) p = &(*p)->rb_left; else p = &(*p)->rb_right; } } rb_link_node(&e->u.rb, parent, p); rb_insert_color(&e->u.rb, root); } /** * wl_entry_destroy - destroy a wear-leveling entry. * @ubi: UBI device description object * @e: the wear-leveling entry to add * * This function destroys a wear leveling entry and removes * the reference from the lookup table. */ static void wl_entry_destroy(struct ubi_device *ubi, struct ubi_wl_entry *e) { ubi->lookuptbl[e->pnum] = NULL; kmem_cache_free(ubi_wl_entry_slab, e); } /** * do_work - do one pending work. * @ubi: UBI device description object * @executed: whether there is one work is executed * * This function returns zero in case of success and a negative error code in * case of failure. If @executed is not NULL and there is one work executed, * @executed is set as %1, otherwise @executed is set as %0. */ static int do_work(struct ubi_device *ubi, int *executed) { int err; struct ubi_work *wrk; cond_resched(); /* * @ubi->work_sem is used to synchronize with the workers. Workers take * it in read mode, so many of them may be doing works at a time. But * the queue flush code has to be sure the whole queue of works is * done, and it takes the mutex in write mode. */ down_read(&ubi->work_sem); spin_lock(&ubi->wl_lock); if (list_empty(&ubi->works)) { spin_unlock(&ubi->wl_lock); up_read(&ubi->work_sem); if (executed) *executed = 0; return 0; } if (executed) *executed = 1; wrk = list_entry(ubi->works.next, struct ubi_work, list); list_del(&wrk->list); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); spin_unlock(&ubi->wl_lock); /* * Call the worker function. Do not touch the work structure * after this call as it will have been freed or reused by that * time by the worker function. */ err = wrk->func(ubi, wrk, 0); if (err) ubi_err(ubi, "work failed with error code %d", err); up_read(&ubi->work_sem); return err; } /** * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree. * @e: the wear-leveling entry to check * @root: the root of the tree * * This function returns non-zero if @e is in the @root RB-tree and zero if it * is not. */ static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root) { struct rb_node *p; p = root->rb_node; while (p) { struct ubi_wl_entry *e1; e1 = rb_entry(p, struct ubi_wl_entry, u.rb); if (e->pnum == e1->pnum) { ubi_assert(e == e1); return 1; } if (e->ec < e1->ec) p = p->rb_left; else if (e->ec > e1->ec) p = p->rb_right; else { ubi_assert(e->pnum != e1->pnum); if (e->pnum < e1->pnum) p = p->rb_left; else p = p->rb_right; } } return 0; } /** * in_pq - check if a wear-leveling entry is present in the protection queue. * @ubi: UBI device description object * @e: the wear-leveling entry to check * * This function returns non-zero if @e is in the protection queue and zero * if it is not. */ static inline int in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e) { struct ubi_wl_entry *p; int i; for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) list_for_each_entry(p, &ubi->pq[i], u.list) if (p == e) return 1; return 0; } /** * prot_queue_add - add physical eraseblock to the protection queue. * @ubi: UBI device description object * @e: the physical eraseblock to add * * This function adds @e to the tail of the protection queue @ubi->pq, where * @e will stay for %UBI_PROT_QUEUE_LEN erase operations and will be * temporarily protected from the wear-leveling worker. Note, @wl->lock has to * be locked. */ static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e) { int pq_tail = ubi->pq_head - 1; if (pq_tail < 0) pq_tail = UBI_PROT_QUEUE_LEN - 1; ubi_assert(pq_tail >= 0 && pq_tail < UBI_PROT_QUEUE_LEN); list_add_tail(&e->u.list, &ubi->pq[pq_tail]); dbg_wl("added PEB %d EC %d to the protection queue", e->pnum, e->ec); } /** * find_wl_entry - find wear-leveling entry closest to certain erase counter. * @ubi: UBI device description object * @root: the RB-tree where to look for * @diff: maximum possible difference from the smallest erase counter * @pick_max: pick PEB even its erase counter beyonds 'min_ec + @diff' * * This function looks for a wear leveling entry with erase counter closest to * min + @diff, where min is the smallest erase counter. */ static struct ubi_wl_entry *find_wl_entry(struct ubi_device *ubi, struct rb_root *root, int diff, int pick_max) { struct rb_node *p; struct ubi_wl_entry *e; int max; e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb); max = e->ec + diff; p = root->rb_node; while (p) { struct ubi_wl_entry *e1; e1 = rb_entry(p, struct ubi_wl_entry, u.rb); if (e1->ec >= max) { if (pick_max) e = e1; p = p->rb_left; } else { p = p->rb_right; e = e1; } } return e; } /** * find_mean_wl_entry - find wear-leveling entry with medium erase counter. * @ubi: UBI device description object * @root: the RB-tree where to look for * * This function looks for a wear leveling entry with medium erase counter, * but not greater or equivalent than the lowest erase counter plus * %WL_FREE_MAX_DIFF/2. */ static struct ubi_wl_entry *find_mean_wl_entry(struct ubi_device *ubi, struct rb_root *root) { struct ubi_wl_entry *e, *first, *last; first = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb); last = rb_entry(rb_last(root), struct ubi_wl_entry, u.rb); if (last->ec - first->ec < WL_FREE_MAX_DIFF) { e = rb_entry(root->rb_node, struct ubi_wl_entry, u.rb); /* * If no fastmap has been written and fm_anchor is not * reserved and this WL entry can be used as anchor PEB * hold it back and return the second best WL entry such * that fastmap can use the anchor PEB later. */ e = may_reserve_for_fm(ubi, e, root); } else e = find_wl_entry(ubi, root, WL_FREE_MAX_DIFF/2, 0); return e; } /** * wl_get_wle - get a mean wl entry to be used by ubi_wl_get_peb() or * refill_wl_user_pool(). * @ubi: UBI device description object * * This function returns a wear leveling entry in case of success and * NULL in case of failure. */ static struct ubi_wl_entry *wl_get_wle(struct ubi_device *ubi) { struct ubi_wl_entry *e; e = find_mean_wl_entry(ubi, &ubi->free); if (!e) { ubi_err(ubi, "no free eraseblocks"); return NULL; } self_check_in_wl_tree(ubi, e, &ubi->free); /* * Move the physical eraseblock to the protection queue where it will * be protected from being moved for some time. */ rb_erase(&e->u.rb, &ubi->free); ubi->free_count--; dbg_wl("PEB %d EC %d", e->pnum, e->ec); return e; } /** * prot_queue_del - remove a physical eraseblock from the protection queue. * @ubi: UBI device description object * @pnum: the physical eraseblock to remove * * This function deletes PEB @pnum from the protection queue and returns zero * in case of success and %-ENODEV if the PEB was not found. */ static int prot_queue_del(struct ubi_device *ubi, int pnum) { struct ubi_wl_entry *e; e = ubi->lookuptbl[pnum]; if (!e) return -ENODEV; if (self_check_in_pq(ubi, e)) return -ENODEV; list_del(&e->u.list); dbg_wl("deleted PEB %d from the protection queue", e->pnum); return 0; } /** * ubi_sync_erase - synchronously erase a physical eraseblock. * @ubi: UBI device description object * @e: the physical eraseblock to erase * @torture: if the physical eraseblock has to be tortured * * This function returns zero in case of success and a negative error code in * case of failure. */ int ubi_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture) { int err; struct ubi_ec_hdr *ec_hdr; unsigned long long ec = e->ec; dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec); err = self_check_ec(ubi, e->pnum, e->ec); if (err) return -EINVAL; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); if (!ec_hdr) return -ENOMEM; err = ubi_io_sync_erase(ubi, e->pnum, torture); if (err < 0) goto out_free; ec += err; if (ec > UBI_MAX_ERASECOUNTER) { /* * Erase counter overflow. Upgrade UBI and use 64-bit * erase counters internally. */ ubi_err(ubi, "erase counter overflow at PEB %d, EC %llu", e->pnum, ec); err = -EINVAL; goto out_free; } dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec); ec_hdr->ec = cpu_to_be64(ec); err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr); if (err) goto out_free; e->ec = ec; spin_lock(&ubi->wl_lock); if (e->ec > ubi->max_ec) ubi->max_ec = e->ec; spin_unlock(&ubi->wl_lock); out_free: kfree(ec_hdr); return err; } /** * serve_prot_queue - check if it is time to stop protecting PEBs. * @ubi: UBI device description object * * This function is called after each erase operation and removes PEBs from the * tail of the protection queue. These PEBs have been protected for long enough * and should be moved to the used tree. */ static void serve_prot_queue(struct ubi_device *ubi) { struct ubi_wl_entry *e, *tmp; int count; /* * There may be several protected physical eraseblock to remove, * process them all. */ repeat: count = 0; spin_lock(&ubi->wl_lock); list_for_each_entry_safe(e, tmp, &ubi->pq[ubi->pq_head], u.list) { dbg_wl("PEB %d EC %d protection over, move to used tree", e->pnum, e->ec); list_del(&e->u.list); wl_tree_add(e, &ubi->used); if (count++ > 32) { /* * Let's be nice and avoid holding the spinlock for * too long. */ spin_unlock(&ubi->wl_lock); cond_resched(); goto repeat; } } ubi->pq_head += 1; if (ubi->pq_head == UBI_PROT_QUEUE_LEN) ubi->pq_head = 0; ubi_assert(ubi->pq_head >= 0 && ubi->pq_head < UBI_PROT_QUEUE_LEN); spin_unlock(&ubi->wl_lock); } /** * __schedule_ubi_work - schedule a work. * @ubi: UBI device description object * @wrk: the work to schedule * * This function adds a work defined by @wrk to the tail of the pending works * list. Can only be used if ubi->work_sem is already held in read mode! */ static void __schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) { spin_lock(&ubi->wl_lock); list_add_tail(&wrk->list, &ubi->works); ubi_assert(ubi->works_count >= 0); ubi->works_count += 1; if (ubi->thread_enabled && !ubi_dbg_is_bgt_disabled(ubi)) wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); } /** * schedule_ubi_work - schedule a work. * @ubi: UBI device description object * @wrk: the work to schedule * * This function adds a work defined by @wrk to the tail of the pending works * list. */ static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) { down_read(&ubi->work_sem); __schedule_ubi_work(ubi, wrk); up_read(&ubi->work_sem); } static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, int shutdown); /** * schedule_erase - schedule an erase work. * @ubi: UBI device description object * @e: the WL entry of the physical eraseblock to erase * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured * @nested: denotes whether the work_sem is already held * * This function returns zero in case of success and a %-ENOMEM in case of * failure. */ static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int vol_id, int lnum, int torture, bool nested) { struct ubi_work *wl_wrk; ubi_assert(e); dbg_wl("schedule erasure of PEB %d, EC %d, torture %d", e->pnum, e->ec, torture); wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS); if (!wl_wrk) return -ENOMEM; wl_wrk->func = &erase_worker; wl_wrk->e = e; wl_wrk->vol_id = vol_id; wl_wrk->lnum = lnum; wl_wrk->torture = torture; if (nested) __schedule_ubi_work(ubi, wl_wrk); else schedule_ubi_work(ubi, wl_wrk); return 0; } static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk); /** * do_sync_erase - run the erase worker synchronously. * @ubi: UBI device description object * @e: the WL entry of the physical eraseblock to erase * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured * */ static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int vol_id, int lnum, int torture) { struct ubi_work wl_wrk; dbg_wl("sync erase of PEB %i", e->pnum); wl_wrk.e = e; wl_wrk.vol_id = vol_id; wl_wrk.lnum = lnum; wl_wrk.torture = torture; return __erase_worker(ubi, &wl_wrk); } static int ensure_wear_leveling(struct ubi_device *ubi, int nested); /** * wear_leveling_worker - wear-leveling worker function. * @ubi: UBI device description object * @wrk: the work object * @shutdown: non-zero if the worker has to free memory and exit * because the WL-subsystem is shutting down * * This function copies a more worn out physical eraseblock to a less worn out * one. Returns zero in case of success and a negative error code in case of * failure. */ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, int shutdown) { int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0; int erase = 0, keep = 0, vol_id = -1, lnum = -1; struct ubi_wl_entry *e1, *e2; struct ubi_vid_io_buf *vidb; struct ubi_vid_hdr *vid_hdr; int dst_leb_clean = 0; kfree(wrk); if (shutdown) return 0; vidb = ubi_alloc_vid_buf(ubi, GFP_NOFS); if (!vidb) return -ENOMEM; vid_hdr = ubi_get_vid_hdr(vidb); down_read(&ubi->fm_eba_sem); mutex_lock(&ubi->move_mutex); spin_lock(&ubi->wl_lock); ubi_assert(!ubi->move_from && !ubi->move_to); ubi_assert(!ubi->move_to_put); #ifdef CONFIG_MTD_UBI_FASTMAP if (!next_peb_for_wl(ubi, true) || #else if (!ubi->free.rb_node || #endif (!ubi->used.rb_node && !ubi->scrub.rb_node)) { /* * No free physical eraseblocks? Well, they must be waiting in * the queue to be erased. Cancel movement - it will be * triggered again when a free physical eraseblock appears. * * No used physical eraseblocks? They must be temporarily * protected from being moved. They will be moved to the * @ubi->used tree later and the wear-leveling will be * triggered again. */ dbg_wl("cancel WL, a list is empty: free %d, used %d", !ubi->free.rb_node, !ubi->used.rb_node); goto out_cancel; } #ifdef CONFIG_MTD_UBI_FASTMAP e1 = find_anchor_wl_entry(&ubi->used); if (e1 && ubi->fm_anchor && (ubi->fm_anchor->ec - e1->ec >= UBI_WL_THRESHOLD)) { ubi->fm_do_produce_anchor = 1; /* * fm_anchor is no longer considered a good anchor. * NULL assignment also prevents multiple wear level checks * of this PEB. */ wl_tree_add(ubi->fm_anchor, &ubi->free); ubi->fm_anchor = NULL; ubi->free_count++; } if (ubi->fm_do_produce_anchor) { if (!e1) goto out_cancel; e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; self_check_in_wl_tree(ubi, e1, &ubi->used); rb_erase(&e1->u.rb, &ubi->used); dbg_wl("anchor-move PEB %d to PEB %d", e1->pnum, e2->pnum); ubi->fm_do_produce_anchor = 0; } else if (!ubi->scrub.rb_node) { #else if (!ubi->scrub.rb_node) { #endif /* * Now pick the least worn-out used physical eraseblock and a * highly worn-out free physical eraseblock. If the erase * counters differ much enough, start wear-leveling. */ e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) { dbg_wl("no WL needed: min used EC %d, max free EC %d", e1->ec, e2->ec); /* Give the unused PEB back */ wl_tree_add(e2, &ubi->free); ubi->free_count++; goto out_cancel; } self_check_in_wl_tree(ubi, e1, &ubi->used); rb_erase(&e1->u.rb, &ubi->used); dbg_wl("move PEB %d EC %d to PEB %d EC %d", e1->pnum, e1->ec, e2->pnum, e2->ec); } else { /* Perform scrubbing */ scrubbing = 1; e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb); e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; self_check_in_wl_tree(ubi, e1, &ubi->scrub); rb_erase(&e1->u.rb, &ubi->scrub); dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum); } ubi->move_from = e1; ubi->move_to = e2; spin_unlock(&ubi->wl_lock); /* * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum. * We so far do not know which logical eraseblock our physical * eraseblock (@e1) belongs to. We have to read the volume identifier * header first. * * Note, we are protected from this PEB being unmapped and erased. The * 'ubi_wl_put_peb()' would wait for moving to be finished if the PEB * which is being moved was unmapped. */ err = ubi_io_read_vid_hdr(ubi, e1->pnum, vidb, 0); if (err && err != UBI_IO_BITFLIPS) { dst_leb_clean = 1; if (err == UBI_IO_FF) { /* * We are trying to move PEB without a VID header. UBI * always write VID headers shortly after the PEB was * given, so we have a situation when it has not yet * had a chance to write it, because it was preempted. * So add this PEB to the protection queue so far, * because presumably more data will be written there * (including the missing VID header), and then we'll * move it. */ dbg_wl("PEB %d has no VID header", e1->pnum); protect = 1; goto out_not_moved; } else if (err == UBI_IO_FF_BITFLIPS) { /* * The same situation as %UBI_IO_FF, but bit-flips were * detected. It is better to schedule this PEB for * scrubbing. */ dbg_wl("PEB %d has no VID header but has bit-flips", e1->pnum); scrubbing = 1; goto out_not_moved; } else if (ubi->fast_attach && err == UBI_IO_BAD_HDR_EBADMSG) { /* * While a full scan would detect interrupted erasures * at attach time we can face them here when attached from * Fastmap. */ dbg_wl("PEB %d has ECC errors, maybe from an interrupted erasure", e1->pnum); erase = 1; goto out_not_moved; } ubi_err(ubi, "error %d while reading VID header from PEB %d", err, e1->pnum); goto out_error; } vol_id = be32_to_cpu(vid_hdr->vol_id); lnum = be32_to_cpu(vid_hdr->lnum); err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vidb); if (err) { if (err == MOVE_CANCEL_RACE) { /* * The LEB has not been moved because the volume is * being deleted or the PEB has been put meanwhile. We * should prevent this PEB from being selected for * wear-leveling movement again, so put it to the * protection queue. */ protect = 1; dst_leb_clean = 1; goto out_not_moved; } if (err == MOVE_RETRY) { /* * For source PEB: * 1. The scrubbing is set for scrub type PEB, it will * be put back into ubi->scrub list. * 2. Non-scrub type PEB will be put back into ubi->used * list. */ keep = 1; dst_leb_clean = 1; goto out_not_moved; } if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR || err == MOVE_TARGET_RD_ERR) { /* * Target PEB had bit-flips or write error - torture it. */ torture = 1; keep = 1; goto out_not_moved; } if (err == MOVE_SOURCE_RD_ERR) { /* * An error happened while reading the source PEB. Do * not switch to R/O mode in this case, and give the * upper layers a possibility to recover from this, * e.g. by unmapping corresponding LEB. Instead, just * put this PEB to the @ubi->erroneous list to prevent * UBI from trying to move it over and over again. */ if (ubi->erroneous_peb_count > ubi->max_erroneous) { ubi_err(ubi, "too many erroneous eraseblocks (%d)", ubi->erroneous_peb_count); goto out_error; } dst_leb_clean = 1; erroneous = 1; goto out_not_moved; } if (err < 0) goto out_error; ubi_assert(0); } /* The PEB has been successfully moved */ if (scrubbing) ubi_msg(ubi, "scrubbed PEB %d (LEB %d:%d), data moved to PEB %d", e1->pnum, vol_id, lnum, e2->pnum); ubi_free_vid_buf(vidb); spin_lock(&ubi->wl_lock); if (!ubi->move_to_put) { wl_tree_add(e2, &ubi->used); e2 = NULL; } ubi->move_from = ubi->move_to = NULL; ubi->move_to_put = ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); err = do_sync_erase(ubi, e1, vol_id, lnum, 0); if (err) { if (e2) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e2); spin_unlock(&ubi->wl_lock); } goto out_ro; } if (e2) { /* * Well, the target PEB was put meanwhile, schedule it for * erasure. */ dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase", e2->pnum, vol_id, lnum); err = do_sync_erase(ubi, e2, vol_id, lnum, 0); if (err) goto out_ro; } dbg_wl("done"); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); return 0; /* * For some reasons the LEB was not moved, might be an error, might be * something else. @e1 was not changed, so return it back. @e2 might * have been changed, schedule it for erasure. */ out_not_moved: if (vol_id != -1) dbg_wl("cancel moving PEB %d (LEB %d:%d) to PEB %d (%d)", e1->pnum, vol_id, lnum, e2->pnum, err); else dbg_wl("cancel moving PEB %d to PEB %d (%d)", e1->pnum, e2->pnum, err); spin_lock(&ubi->wl_lock); if (protect) prot_queue_add(ubi, e1); else if (erroneous) { wl_tree_add(e1, &ubi->erroneous); ubi->erroneous_peb_count += 1; } else if (scrubbing) wl_tree_add(e1, &ubi->scrub); else if (keep) wl_tree_add(e1, &ubi->used); if (dst_leb_clean) { wl_tree_add(e2, &ubi->free); ubi->free_count++; } ubi_assert(!ubi->move_to_put); ubi->move_from = ubi->move_to = NULL; ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); ubi_free_vid_buf(vidb); if (dst_leb_clean) { ensure_wear_leveling(ubi, 1); } else { err = do_sync_erase(ubi, e2, vol_id, lnum, torture); if (err) goto out_ro; } if (erase) { err = do_sync_erase(ubi, e1, vol_id, lnum, 1); if (err) goto out_ro; } mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); return 0; out_error: if (vol_id != -1) ubi_err(ubi, "error %d while moving PEB %d to PEB %d", err, e1->pnum, e2->pnum); else ubi_err(ubi, "error %d while moving PEB %d (LEB %d:%d) to PEB %d", err, e1->pnum, vol_id, lnum, e2->pnum); spin_lock(&ubi->wl_lock); ubi->move_from = ubi->move_to = NULL; ubi->move_to_put = ubi->wl_scheduled = 0; wl_entry_destroy(ubi, e1); wl_entry_destroy(ubi, e2); spin_unlock(&ubi->wl_lock); ubi_free_vid_buf(vidb); out_ro: ubi_ro_mode(ubi); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); ubi_assert(err != 0); return err < 0 ? err : -EIO; out_cancel: ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); ubi_free_vid_buf(vidb); return 0; } /** * ensure_wear_leveling - schedule wear-leveling if it is needed. * @ubi: UBI device description object * @nested: set to non-zero if this function is called from UBI worker * * This function checks if it is time to start wear-leveling and schedules it * if yes. This function returns zero in case of success and a negative error * code in case of failure. */ static int ensure_wear_leveling(struct ubi_device *ubi, int nested) { int err = 0; struct ubi_work *wrk; spin_lock(&ubi->wl_lock); if (ubi->wl_scheduled) /* Wear-leveling is already in the work queue */ goto out_unlock; /* * If the ubi->scrub tree is not empty, scrubbing is needed, and the * WL worker has to be scheduled anyway. */ if (!ubi->scrub.rb_node) { #ifdef CONFIG_MTD_UBI_FASTMAP if (!need_wear_leveling(ubi)) goto out_unlock; #else struct ubi_wl_entry *e1; struct ubi_wl_entry *e2; if (!ubi->used.rb_node || !ubi->free.rb_node) /* No physical eraseblocks - no deal */ goto out_unlock; /* * We schedule wear-leveling only if the difference between the * lowest erase counter of used physical eraseblocks and a high * erase counter of free physical eraseblocks is greater than * %UBI_WL_THRESHOLD. */ e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF, 0); if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) goto out_unlock; #endif dbg_wl("schedule wear-leveling"); } else dbg_wl("schedule scrubbing"); ubi->wl_scheduled = 1; spin_unlock(&ubi->wl_lock); wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS); if (!wrk) { err = -ENOMEM; goto out_cancel; } wrk->func = &wear_leveling_worker; if (nested) __schedule_ubi_work(ubi, wrk); else schedule_ubi_work(ubi, wrk); return err; out_cancel: spin_lock(&ubi->wl_lock); ubi->wl_scheduled = 0; out_unlock: spin_unlock(&ubi->wl_lock); return err; } /** * __erase_worker - physical eraseblock erase worker function. * @ubi: UBI device description object * @wl_wrk: the work object * * This function erases a physical eraseblock and perform torture testing if * needed. It also takes care about marking the physical eraseblock bad if * needed. Returns zero in case of success and a negative error code in case of * failure. */ static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk) { struct ubi_wl_entry *e = wl_wrk->e; int pnum = e->pnum; int vol_id = wl_wrk->vol_id; int lnum = wl_wrk->lnum; int err, available_consumed = 0; dbg_wl("erase PEB %d EC %d LEB %d:%d", pnum, e->ec, wl_wrk->vol_id, wl_wrk->lnum); err = ubi_sync_erase(ubi, e, wl_wrk->torture); if (!err) { spin_lock(&ubi->wl_lock); if (!ubi->fm_disabled && !ubi->fm_anchor && e->pnum < UBI_FM_MAX_START) { /* * Abort anchor production, if needed it will be * enabled again in the wear leveling started below. */ ubi->fm_anchor = e; ubi->fm_do_produce_anchor = 0; } else { wl_tree_add(e, &ubi->free); ubi->free_count++; } spin_unlock(&ubi->wl_lock); /* * One more erase operation has happened, take care about * protected physical eraseblocks. */ serve_prot_queue(ubi); /* And take care about wear-leveling */ err = ensure_wear_leveling(ubi, 1); return err; } ubi_err(ubi, "failed to erase PEB %d, error %d", pnum, err); if (err == -EINTR || err == -ENOMEM || err == -EAGAIN || err == -EBUSY) { int err1; /* Re-schedule the LEB for erasure */ err1 = schedule_erase(ubi, e, vol_id, lnum, 0, true); if (err1) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); spin_unlock(&ubi->wl_lock); err = err1; goto out_ro; } return err; } spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); spin_unlock(&ubi->wl_lock); if (err != -EIO) /* * If this is not %-EIO, we have no idea what to do. Scheduling * this physical eraseblock for erasure again would cause * errors again and again. Well, lets switch to R/O mode. */ goto out_ro; /* It is %-EIO, the PEB went bad */ if (!ubi->bad_allowed) { ubi_err(ubi, "bad physical eraseblock %d detected", pnum); goto out_ro; } spin_lock(&ubi->volumes_lock); if (ubi->beb_rsvd_pebs == 0) { if (ubi->avail_pebs == 0) { spin_unlock(&ubi->volumes_lock); ubi_err(ubi, "no reserved/available physical eraseblocks"); goto out_ro; } ubi->avail_pebs -= 1; available_consumed = 1; } spin_unlock(&ubi->volumes_lock); ubi_msg(ubi, "mark PEB %d as bad", pnum); err = ubi_io_mark_bad(ubi, pnum); if (err) goto out_ro; spin_lock(&ubi->volumes_lock); if (ubi->beb_rsvd_pebs > 0) { if (available_consumed) { /* * The amount of reserved PEBs increased since we last * checked. */ ubi->avail_pebs += 1; available_consumed = 0; } ubi->beb_rsvd_pebs -= 1; } ubi->bad_peb_count += 1; ubi->good_peb_count -= 1; ubi_calculate_reserved(ubi); if (available_consumed) ubi_warn(ubi, "no PEBs in the reserved pool, used an available PEB"); else if (ubi->beb_rsvd_pebs) ubi_msg(ubi, "%d PEBs left in the reserve", ubi->beb_rsvd_pebs); else ubi_warn(ubi, "last PEB from the reserve was used"); spin_unlock(&ubi->volumes_lock); return err; out_ro: if (available_consumed) { spin_lock(&ubi->volumes_lock); ubi->avail_pebs += 1; spin_unlock(&ubi->volumes_lock); } ubi_ro_mode(ubi); return err; } static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, int shutdown) { int ret; if (shutdown) { struct ubi_wl_entry *e = wl_wrk->e; dbg_wl("cancel erasure of PEB %d EC %d", e->pnum, e->ec); kfree(wl_wrk); wl_entry_destroy(ubi, e); return 0; } ret = __erase_worker(ubi, wl_wrk); kfree(wl_wrk); return ret; } /** * ubi_wl_put_peb - return a PEB to the wear-leveling sub-system. * @ubi: UBI device description object * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @pnum: physical eraseblock to return * @torture: if this physical eraseblock has to be tortured * * This function is called to return physical eraseblock @pnum to the pool of * free physical eraseblocks. The @torture flag has to be set if an I/O error * occurred to this @pnum and it has to be tested. This function returns zero * in case of success, and a negative error code in case of failure. */ int ubi_wl_put_peb(struct ubi_device *ubi, int vol_id, int lnum, int pnum, int torture) { int err; struct ubi_wl_entry *e; dbg_wl("PEB %d", pnum); ubi_assert(pnum >= 0); ubi_assert(pnum < ubi->peb_count); down_read(&ubi->fm_protect); retry: spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (!e) { /* * This wl entry has been removed for some errors by other * process (eg. wear leveling worker), corresponding process * (except __erase_worker, which cannot concurrent with * ubi_wl_put_peb) will set ubi ro_mode at the same time, * just ignore this wl entry. */ spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return 0; } if (e == ubi->move_from) { /* * User is putting the physical eraseblock which was selected to * be moved. It will be scheduled for erasure in the * wear-leveling worker. */ dbg_wl("PEB %d is being moved, wait", pnum); spin_unlock(&ubi->wl_lock); /* Wait for the WL worker by taking the @ubi->move_mutex */ mutex_lock(&ubi->move_mutex); mutex_unlock(&ubi->move_mutex); goto retry; } else if (e == ubi->move_to) { /* * User is putting the physical eraseblock which was selected * as the target the data is moved to. It may happen if the EBA * sub-system already re-mapped the LEB in 'ubi_eba_copy_leb()' * but the WL sub-system has not put the PEB to the "used" tree * yet, but it is about to do this. So we just set a flag which * will tell the WL worker that the PEB is not needed anymore * and should be scheduled for erasure. */ dbg_wl("PEB %d is the target of data moving", pnum); ubi_assert(!ubi->move_to_put); ubi->move_to_put = 1; spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return 0; } else { if (in_wl_tree(e, &ubi->used)) { self_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else if (in_wl_tree(e, &ubi->scrub)) { self_check_in_wl_tree(ubi, e, &ubi->scrub); rb_erase(&e->u.rb, &ubi->scrub); } else if (in_wl_tree(e, &ubi->erroneous)) { self_check_in_wl_tree(ubi, e, &ubi->erroneous); rb_erase(&e->u.rb, &ubi->erroneous); ubi->erroneous_peb_count -= 1; ubi_assert(ubi->erroneous_peb_count >= 0); /* Erroneous PEBs should be tortured */ torture = 1; } else { err = prot_queue_del(ubi, e->pnum); if (err) { ubi_err(ubi, "PEB %d not found", pnum); ubi_ro_mode(ubi); spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return err; } } } spin_unlock(&ubi->wl_lock); err = schedule_erase(ubi, e, vol_id, lnum, torture, false); if (err) { spin_lock(&ubi->wl_lock); wl_tree_add(e, &ubi->used); spin_unlock(&ubi->wl_lock); } up_read(&ubi->fm_protect); return err; } /** * ubi_wl_scrub_peb - schedule a physical eraseblock for scrubbing. * @ubi: UBI device description object * @pnum: the physical eraseblock to schedule * * If a bit-flip in a physical eraseblock is detected, this physical eraseblock * needs scrubbing. This function schedules a physical eraseblock for * scrubbing which is done in background. This function returns zero in case of * success and a negative error code in case of failure. */ int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum) { struct ubi_wl_entry *e; ubi_msg(ubi, "schedule PEB %d for scrubbing", pnum); retry: spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (e == ubi->move_from || in_wl_tree(e, &ubi->scrub) || in_wl_tree(e, &ubi->erroneous)) { spin_unlock(&ubi->wl_lock); return 0; } if (e == ubi->move_to) { /* * This physical eraseblock was used to move data to. The data * was moved but the PEB was not yet inserted to the proper * tree. We should just wait a little and let the WL worker * proceed. */ spin_unlock(&ubi->wl_lock); dbg_wl("the PEB %d is not in proper tree, retry", pnum); yield(); goto retry; } if (in_wl_tree(e, &ubi->used)) { self_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else { int err; err = prot_queue_del(ubi, e->pnum); if (err) { ubi_err(ubi, "PEB %d not found", pnum); ubi_ro_mode(ubi); spin_unlock(&ubi->wl_lock); return err; } } wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); /* * Technically scrubbing is the same as wear-leveling, so it is done * by the WL worker. */ return ensure_wear_leveling(ubi, 0); } /** * ubi_wl_flush - flush all pending works. * @ubi: UBI device description object * @vol_id: the volume id to flush for * @lnum: the logical eraseblock number to flush for * * This function executes all pending works for a particular volume id / * logical eraseblock number pair. If either value is set to %UBI_ALL, then it * acts as a wildcard for all of the corresponding volume numbers or logical * eraseblock numbers. It returns zero in case of success and a negative error * code in case of failure. */ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) { int err = 0; int found = 1; /* * Erase while the pending works queue is not empty, but not more than * the number of currently pending works. */ dbg_wl("flush pending work for LEB %d:%d (%d pending works)", vol_id, lnum, ubi->works_count); while (found) { struct ubi_work *wrk, *tmp; found = 0; down_read(&ubi->work_sem); spin_lock(&ubi->wl_lock); list_for_each_entry_safe(wrk, tmp, &ubi->works, list) { if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) && (lnum == UBI_ALL || wrk->lnum == lnum)) { list_del(&wrk->list); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); spin_unlock(&ubi->wl_lock); err = wrk->func(ubi, wrk, 0); if (err) { up_read(&ubi->work_sem); return err; } spin_lock(&ubi->wl_lock); found = 1; break; } } spin_unlock(&ubi->wl_lock); up_read(&ubi->work_sem); } /* * Make sure all the works which have been done in parallel are * finished. */ down_write(&ubi->work_sem); up_write(&ubi->work_sem); return err; } static bool scrub_possible(struct ubi_device *ubi, struct ubi_wl_entry *e) { if (in_wl_tree(e, &ubi->scrub)) return false; else if (in_wl_tree(e, &ubi->erroneous)) return false; else if (ubi->move_from == e) return false; else if (ubi->move_to == e) return false; return true; } /** * ubi_bitflip_check - Check an eraseblock for bitflips and scrub it if needed. * @ubi: UBI device description object * @pnum: the physical eraseblock to schedule * @force: don't read the block, assume bitflips happened and take action. * * This function reads the given eraseblock and checks if bitflips occured. * In case of bitflips, the eraseblock is scheduled for scrubbing. * If scrubbing is forced with @force, the eraseblock is not read, * but scheduled for scrubbing right away. * * Returns: * %EINVAL, PEB is out of range * %ENOENT, PEB is no longer used by UBI * %EBUSY, PEB cannot be checked now or a check is currently running on it * %EAGAIN, bit flips happened but scrubbing is currently not possible * %EUCLEAN, bit flips happened and PEB is scheduled for scrubbing * %0, no bit flips detected */ int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force) { int err = 0; struct ubi_wl_entry *e; if (pnum < 0 || pnum >= ubi->peb_count) { err = -EINVAL; goto out; } /* * Pause all parallel work, otherwise it can happen that the * erase worker frees a wl entry under us. */ down_write(&ubi->work_sem); /* * Make sure that the wl entry does not change state while * inspecting it. */ spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (!e) { spin_unlock(&ubi->wl_lock); err = -ENOENT; goto out_resume; } /* * Does it make sense to check this PEB? */ if (!scrub_possible(ubi, e)) { spin_unlock(&ubi->wl_lock); err = -EBUSY; goto out_resume; } spin_unlock(&ubi->wl_lock); if (!force) { mutex_lock(&ubi->buf_mutex); err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size); mutex_unlock(&ubi->buf_mutex); } if (force || err == UBI_IO_BITFLIPS) { /* * Okay, bit flip happened, let's figure out what we can do. */ spin_lock(&ubi->wl_lock); /* * Recheck. We released wl_lock, UBI might have killed the * wl entry under us. */ e = ubi->lookuptbl[pnum]; if (!e) { spin_unlock(&ubi->wl_lock); err = -ENOENT; goto out_resume; } /* * Need to re-check state */ if (!scrub_possible(ubi, e)) { spin_unlock(&ubi->wl_lock); err = -EBUSY; goto out_resume; } if (in_pq(ubi, e)) { prot_queue_del(ubi, e->pnum); wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); err = ensure_wear_leveling(ubi, 1); } else if (in_wl_tree(e, &ubi->used)) { rb_erase(&e->u.rb, &ubi->used); wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); err = ensure_wear_leveling(ubi, 1); } else if (in_wl_tree(e, &ubi->free)) { rb_erase(&e->u.rb, &ubi->free); ubi->free_count--; spin_unlock(&ubi->wl_lock); /* * This PEB is empty we can schedule it for * erasure right away. No wear leveling needed. */ err = schedule_erase(ubi, e, UBI_UNKNOWN, UBI_UNKNOWN, force ? 0 : 1, true); } else { spin_unlock(&ubi->wl_lock); err = -EAGAIN; } if (!err && !force) err = -EUCLEAN; } else { err = 0; } out_resume: up_write(&ubi->work_sem); out: return err; } /** * tree_destroy - destroy an RB-tree. * @ubi: UBI device description object * @root: the root of the tree to destroy */ static void tree_destroy(struct ubi_device *ubi, struct rb_root *root) { struct rb_node *rb; struct ubi_wl_entry *e; rb = root->rb_node; while (rb) { if (rb->rb_left) rb = rb->rb_left; else if (rb->rb_right) rb = rb->rb_right; else { e = rb_entry(rb, struct ubi_wl_entry, u.rb); rb = rb_parent(rb); if (rb) { if (rb->rb_left == &e->u.rb) rb->rb_left = NULL; else rb->rb_right = NULL; } wl_entry_destroy(ubi, e); } } } /** * ubi_thread - UBI background thread. * @u: the UBI device description object pointer */ int ubi_thread(void *u) { int failures = 0; struct ubi_device *ubi = u; ubi_msg(ubi, "background thread \"%s\" started, PID %d", ubi->bgt_name, task_pid_nr(current)); set_freezable(); for (;;) { int err; if (kthread_should_stop()) break; if (try_to_freeze()) continue; spin_lock(&ubi->wl_lock); if (list_empty(&ubi->works) || ubi->ro_mode || !ubi->thread_enabled || ubi_dbg_is_bgt_disabled(ubi)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&ubi->wl_lock); /* * Check kthread_should_stop() after we set the task * state to guarantee that we either see the stop bit * and exit or the task state is reset to runnable such * that it's not scheduled out indefinitely and detects * the stop bit at kthread_should_stop(). */ if (kthread_should_stop()) { set_current_state(TASK_RUNNING); break; } schedule(); continue; } spin_unlock(&ubi->wl_lock); err = do_work(ubi, NULL); if (err) { ubi_err(ubi, "%s: work failed with error code %d", ubi->bgt_name, err); if (failures++ > WL_MAX_FAILURES) { /* * Too many failures, disable the thread and * switch to read-only mode. */ ubi_msg(ubi, "%s: %d consecutive failures", ubi->bgt_name, WL_MAX_FAILURES); ubi_ro_mode(ubi); ubi->thread_enabled = 0; continue; } } else failures = 0; cond_resched(); } dbg_wl("background thread \"%s\" is killed", ubi->bgt_name); ubi->thread_enabled = 0; return 0; } /** * shutdown_work - shutdown all pending works. * @ubi: UBI device description object */ static void shutdown_work(struct ubi_device *ubi) { while (!list_empty(&ubi->works)) { struct ubi_work *wrk; wrk = list_entry(ubi->works.next, struct ubi_work, list); list_del(&wrk->list); wrk->func(ubi, wrk, 1); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); } } /** * erase_aeb - erase a PEB given in UBI attach info PEB * @ubi: UBI device description object * @aeb: UBI attach info PEB * @sync: If true, erase synchronously. Otherwise schedule for erasure */ static int erase_aeb(struct ubi_device *ubi, struct ubi_ainf_peb *aeb, bool sync) { struct ubi_wl_entry *e; int err; e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) return -ENOMEM; e->pnum = aeb->pnum; e->ec = aeb->ec; ubi->lookuptbl[e->pnum] = e; if (sync) { err = ubi_sync_erase(ubi, e, false); if (err) goto out_free; wl_tree_add(e, &ubi->free); ubi->free_count++; } else { err = schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0, false); if (err) goto out_free; } return 0; out_free: wl_entry_destroy(ubi, e); return err; } /** * ubi_wl_init - initialize the WL sub-system using attaching information. * @ubi: UBI device description object * @ai: attaching information * * This function returns zero in case of success, and a negative error code in * case of failure. */ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) { int err, i, reserved_pebs, found_pebs = 0; struct rb_node *rb1, *rb2; struct ubi_ainf_volume *av; struct ubi_ainf_peb *aeb, *tmp; struct ubi_wl_entry *e; ubi->used = ubi->erroneous = ubi->free = ubi->scrub = RB_ROOT; spin_lock_init(&ubi->wl_lock); mutex_init(&ubi->move_mutex); init_rwsem(&ubi->work_sem); ubi->max_ec = ai->max_ec; INIT_LIST_HEAD(&ubi->works); sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num); err = -ENOMEM; ubi->lookuptbl = kcalloc(ubi->peb_count, sizeof(void *), GFP_KERNEL); if (!ubi->lookuptbl) return err; for (i = 0; i < UBI_PROT_QUEUE_LEN; i++) INIT_LIST_HEAD(&ubi->pq[i]); ubi->pq_head = 0; ubi->free_count = 0; list_for_each_entry_safe(aeb, tmp, &ai->erase, u.list) { cond_resched(); err = erase_aeb(ubi, aeb, false); if (err) goto out_free; found_pebs++; } list_for_each_entry(aeb, &ai->free, u.list) { cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) { err = -ENOMEM; goto out_free; } e->pnum = aeb->pnum; e->ec = aeb->ec; ubi_assert(e->ec >= 0); wl_tree_add(e, &ubi->free); ubi->free_count++; ubi->lookuptbl[e->pnum] = e; found_pebs++; } ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) { ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb) { cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) { err = -ENOMEM; goto out_free; } e->pnum = aeb->pnum; e->ec = aeb->ec; ubi->lookuptbl[e->pnum] = e; if (!aeb->scrub) { dbg_wl("add PEB %d EC %d to the used tree", e->pnum, e->ec); wl_tree_add(e, &ubi->used); } else { dbg_wl("add PEB %d EC %d to the scrub tree", e->pnum, e->ec); wl_tree_add(e, &ubi->scrub); } found_pebs++; } } list_for_each_entry(aeb, &ai->fastmap, u.list) { cond_resched(); e = ubi_find_fm_block(ubi, aeb->pnum); if (e) { ubi_assert(!ubi->lookuptbl[e->pnum]); ubi->lookuptbl[e->pnum] = e; } else { bool sync = false; /* * Usually old Fastmap PEBs are scheduled for erasure * and we don't have to care about them but if we face * an power cut before scheduling them we need to * take care of them here. */ if (ubi->lookuptbl[aeb->pnum]) continue; /* * The fastmap update code might not find a free PEB for * writing the fastmap anchor to and then reuses the * current fastmap anchor PEB. When this PEB gets erased * and a power cut happens before it is written again we * must make sure that the fastmap attach code doesn't * find any outdated fastmap anchors, hence we erase the * outdated fastmap anchor PEBs synchronously here. */ if (aeb->vol_id == UBI_FM_SB_VOLUME_ID) sync = true; err = erase_aeb(ubi, aeb, sync); if (err) goto out_free; } found_pebs++; } dbg_wl("found %i PEBs", found_pebs); ubi_assert(ubi->good_peb_count == found_pebs); reserved_pebs = WL_RESERVED_PEBS; ubi_fastmap_init(ubi, &reserved_pebs); if (ubi->avail_pebs < reserved_pebs) { ubi_err(ubi, "no enough physical eraseblocks (%d, need %d)", ubi->avail_pebs, reserved_pebs); if (ubi->corr_peb_count) ubi_err(ubi, "%d PEBs are corrupted and not used", ubi->corr_peb_count); err = -ENOSPC; goto out_free; } ubi->avail_pebs -= reserved_pebs; ubi->rsvd_pebs += reserved_pebs; /* Schedule wear-leveling if needed */ err = ensure_wear_leveling(ubi, 0); if (err) goto out_free; #ifdef CONFIG_MTD_UBI_FASTMAP if (!ubi->ro_mode && !ubi->fm_disabled) ubi_ensure_anchor_pebs(ubi); #endif return 0; out_free: shutdown_work(ubi); tree_destroy(ubi, &ubi->used); tree_destroy(ubi, &ubi->free); tree_destroy(ubi, &ubi->scrub); kfree(ubi->lookuptbl); return err; } /** * protection_queue_destroy - destroy the protection queue. * @ubi: UBI device description object */ static void protection_queue_destroy(struct ubi_device *ubi) { int i; struct ubi_wl_entry *e, *tmp; for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) { list_for_each_entry_safe(e, tmp, &ubi->pq[i], u.list) { list_del(&e->u.list); wl_entry_destroy(ubi, e); } } } /** * ubi_wl_close - close the wear-leveling sub-system. * @ubi: UBI device description object */ void ubi_wl_close(struct ubi_device *ubi) { dbg_wl("close the WL sub-system"); ubi_fastmap_close(ubi); shutdown_work(ubi); protection_queue_destroy(ubi); tree_destroy(ubi, &ubi->used); tree_destroy(ubi, &ubi->erroneous); tree_destroy(ubi, &ubi->free); tree_destroy(ubi, &ubi->scrub); kfree(ubi->lookuptbl); } /** * self_check_ec - make sure that the erase counter of a PEB is correct. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * @ec: the erase counter to check * * This function returns zero if the erase counter of physical eraseblock @pnum * is equivalent to @ec, and a negative error code if not or if an error * occurred. */ static int self_check_ec(struct ubi_device *ubi, int pnum, int ec) { int err; long long read_ec; struct ubi_ec_hdr *ec_hdr; if (!ubi_dbg_chk_gen(ubi)) return 0; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); if (!ec_hdr) return -ENOMEM; err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0); if (err && err != UBI_IO_BITFLIPS) { /* The header does not have to exist */ err = 0; goto out_free; } read_ec = be64_to_cpu(ec_hdr->ec); if (ec != read_ec && read_ec - ec > 1) { ubi_err(ubi, "self-check failed for PEB %d", pnum); ubi_err(ubi, "read EC is %lld, should be %d", read_ec, ec); dump_stack(); err = 1; } else err = 0; out_free: kfree(ec_hdr); return err; } /** * self_check_in_wl_tree - check that wear-leveling entry is in WL RB-tree. * @ubi: UBI device description object * @e: the wear-leveling entry to check * @root: the root of the tree * * This function returns zero if @e is in the @root RB-tree and %-EINVAL if it * is not. */ static int self_check_in_wl_tree(const struct ubi_device *ubi, struct ubi_wl_entry *e, struct rb_root *root) { if (!ubi_dbg_chk_gen(ubi)) return 0; if (in_wl_tree(e, root)) return 0; ubi_err(ubi, "self-check failed for PEB %d, EC %d, RB-tree %p ", e->pnum, e->ec, root); dump_stack(); return -EINVAL; } /** * self_check_in_pq - check if wear-leveling entry is in the protection * queue. * @ubi: UBI device description object * @e: the wear-leveling entry to check * * This function returns zero if @e is in @ubi->pq and %-EINVAL if it is not. */ static int self_check_in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e) { if (!ubi_dbg_chk_gen(ubi)) return 0; if (in_pq(ubi, e)) return 0; ubi_err(ubi, "self-check failed for PEB %d, EC %d, Protect queue", e->pnum, e->ec); dump_stack(); return -EINVAL; } #ifndef CONFIG_MTD_UBI_FASTMAP static struct ubi_wl_entry *get_peb_for_wl(struct ubi_device *ubi) { struct ubi_wl_entry *e; e = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF, 0); self_check_in_wl_tree(ubi, e, &ubi->free); ubi->free_count--; ubi_assert(ubi->free_count >= 0); rb_erase(&e->u.rb, &ubi->free); return e; } /** * produce_free_peb - produce a free physical eraseblock. * @ubi: UBI device description object * * This function tries to make a free PEB by means of synchronous execution of * pending works. This may be needed if, for example the background thread is * disabled. Returns zero in case of success and a negative error code in case * of failure. */ static int produce_free_peb(struct ubi_device *ubi) { int err; while (!ubi->free.rb_node && ubi->works_count) { spin_unlock(&ubi->wl_lock); dbg_wl("do one work synchronously"); err = do_work(ubi, NULL); spin_lock(&ubi->wl_lock); if (err) return err; } return 0; } /** * ubi_wl_get_peb - get a physical eraseblock. * @ubi: UBI device description object * * This function returns a physical eraseblock in case of success and a * negative error code in case of failure. * Returns with ubi->fm_eba_sem held in read mode! */ int ubi_wl_get_peb(struct ubi_device *ubi) { int err; struct ubi_wl_entry *e; retry: down_read(&ubi->fm_eba_sem); spin_lock(&ubi->wl_lock); if (!ubi->free.rb_node) { if (ubi->works_count == 0) { ubi_err(ubi, "no free eraseblocks"); ubi_assert(list_empty(&ubi->works)); spin_unlock(&ubi->wl_lock); return -ENOSPC; } err = produce_free_peb(ubi); if (err < 0) { spin_unlock(&ubi->wl_lock); return err; } spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_eba_sem); goto retry; } e = wl_get_wle(ubi); prot_queue_add(ubi, e); spin_unlock(&ubi->wl_lock); err = ubi_self_check_all_ff(ubi, e->pnum, ubi->vid_hdr_aloffset, ubi->peb_size - ubi->vid_hdr_aloffset); if (err) { ubi_err(ubi, "new PEB %d does not contain all 0xFF bytes", e->pnum); return err; } return e->pnum; } #else #include "fastmap-wl.c" #endif
2 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/befs/debug.c * * Copyright (C) 2001 Will Dyson (will_dyson at pobox.com) * * With help from the ntfs-tng driver by Anton Altparmakov * * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) * * debug functions */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #ifdef __KERNEL__ #include <linux/stdarg.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/slab.h> #endif /* __KERNEL__ */ #include "befs.h" void befs_error(const struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("(%s): %pV\n", sb->s_id, &vaf); va_end(args); } void befs_warning(const struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn("(%s): %pV\n", sb->s_id, &vaf); va_end(args); } void befs_debug(const struct super_block *sb, const char *fmt, ...) { #ifdef CONFIG_BEFS_DEBUG struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_debug("(%s): %pV\n", sb->s_id, &vaf); va_end(args); #endif //CONFIG_BEFS_DEBUG } void befs_dump_inode(const struct super_block *sb, befs_inode *inode) { #ifdef CONFIG_BEFS_DEBUG befs_block_run tmp_run; befs_debug(sb, "befs_inode information"); befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, inode->magic1)); tmp_run = fsrun_to_cpu(sb, inode->inode_num); befs_debug(sb, " inode_num %u, %hu, %hu", tmp_run.allocation_group, tmp_run.start, tmp_run.len); befs_debug(sb, " uid %u", fs32_to_cpu(sb, inode->uid)); befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); befs_debug(sb, " create_time %llu", fs64_to_cpu(sb, inode->create_time)); befs_debug(sb, " last_modified_time %llu", fs64_to_cpu(sb, inode->last_modified_time)); tmp_run = fsrun_to_cpu(sb, inode->parent); befs_debug(sb, " parent [%u, %hu, %hu]", tmp_run.allocation_group, tmp_run.start, tmp_run.len); tmp_run = fsrun_to_cpu(sb, inode->attributes); befs_debug(sb, " attributes [%u, %hu, %hu]", tmp_run.allocation_group, tmp_run.start, tmp_run.len); befs_debug(sb, " type %08x", fs32_to_cpu(sb, inode->type)); befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, inode->inode_size)); if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) { befs_debug(sb, " Symbolic link [%s]", inode->data.symlink); } else { int i; for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; i++) { tmp_run = fsrun_to_cpu(sb, inode->data.datastream.direct[i]); befs_debug(sb, " direct %d [%u, %hu, %hu]", i, tmp_run.allocation_group, tmp_run.start, tmp_run.len); } befs_debug(sb, " max_direct_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_direct_range)); tmp_run = fsrun_to_cpu(sb, inode->data.datastream.indirect); befs_debug(sb, " indirect [%u, %hu, %hu]", tmp_run.allocation_group, tmp_run.start, tmp_run.len); befs_debug(sb, " max_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_indirect_range)); tmp_run = fsrun_to_cpu(sb, inode->data.datastream.double_indirect); befs_debug(sb, " double indirect [%u, %hu, %hu]", tmp_run.allocation_group, tmp_run.start, tmp_run.len); befs_debug(sb, " max_double_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_double_indirect_range)); befs_debug(sb, " size %llu", fs64_to_cpu(sb, inode->data.datastream.size)); } #endif //CONFIG_BEFS_DEBUG } /* * Display super block structure for debug. */ void befs_dump_super_block(const struct super_block *sb, befs_super_block *sup) { #ifdef CONFIG_BEFS_DEBUG befs_block_run tmp_run; befs_debug(sb, "befs_super_block information"); befs_debug(sb, " name %s", sup->name); befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, sup->magic1)); befs_debug(sb, " fs_byte_order %08x", fs32_to_cpu(sb, sup->fs_byte_order)); befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, sup->inode_size)); befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); befs_debug(sb, " blocks_per_ag %u", fs32_to_cpu(sb, sup->blocks_per_ag)); befs_debug(sb, " ag_shift %u", fs32_to_cpu(sb, sup->ag_shift)); befs_debug(sb, " num_ags %u", fs32_to_cpu(sb, sup->num_ags)); befs_debug(sb, " flags %08x", fs32_to_cpu(sb, sup->flags)); tmp_run = fsrun_to_cpu(sb, sup->log_blocks); befs_debug(sb, " log_blocks %u, %hu, %hu", tmp_run.allocation_group, tmp_run.start, tmp_run.len); befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start)); befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end)); befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); tmp_run = fsrun_to_cpu(sb, sup->root_dir); befs_debug(sb, " root_dir %u, %hu, %hu", tmp_run.allocation_group, tmp_run.start, tmp_run.len); tmp_run = fsrun_to_cpu(sb, sup->indices); befs_debug(sb, " indices %u, %hu, %hu", tmp_run.allocation_group, tmp_run.start, tmp_run.len); #endif //CONFIG_BEFS_DEBUG } #if 0 /* unused */ void befs_dump_small_data(const struct super_block *sb, befs_small_data *sd) { } /* unused */ void befs_dump_run(const struct super_block *sb, befs_disk_block_run run) { #ifdef CONFIG_BEFS_DEBUG befs_block_run n = fsrun_to_cpu(sb, run); befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len); #endif //CONFIG_BEFS_DEBUG } #endif /* 0 */ void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *super) { #ifdef CONFIG_BEFS_DEBUG befs_debug(sb, "Btree super structure"); befs_debug(sb, " magic %08x", fs32_to_cpu(sb, super->magic)); befs_debug(sb, " node_size %u", fs32_to_cpu(sb, super->node_size)); befs_debug(sb, " max_depth %08x", fs32_to_cpu(sb, super->max_depth)); befs_debug(sb, " data_type %08x", fs32_to_cpu(sb, super->data_type)); befs_debug(sb, " root_node_pointer %016LX", fs64_to_cpu(sb, super->root_node_ptr)); befs_debug(sb, " free_node_pointer %016LX", fs64_to_cpu(sb, super->free_node_ptr)); befs_debug(sb, " maximum size %016LX", fs64_to_cpu(sb, super->max_size)); #endif //CONFIG_BEFS_DEBUG } void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *node) { #ifdef CONFIG_BEFS_DEBUG befs_debug(sb, "Btree node structure"); befs_debug(sb, " left %016LX", fs64_to_cpu(sb, node->left)); befs_debug(sb, " right %016LX", fs64_to_cpu(sb, node->right)); befs_debug(sb, " overflow %016LX", fs64_to_cpu(sb, node->overflow)); befs_debug(sb, " all_key_count %hu", fs16_to_cpu(sb, node->all_key_count)); befs_debug(sb, " all_key_length %hu", fs16_to_cpu(sb, node->all_key_length)); #endif //CONFIG_BEFS_DEBUG }
621 27 27 27 622 622 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos policy for assigning an I/O priority class to requests. * * Using an rq-qos policy for assigning I/O priority class has two advantages * over using the ioprio_set() system call: * * - This policy is cgroup based so it has all the advantages of cgroups. * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos * controller affects page cache writeback I/O for filesystems that support * assiociating a cgroup with writeback I/O. See also * Documentation/admin-guide/cgroup-v2.rst. */ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. */ enum prio_policy { POLICY_NO_CHANGE = 0, POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. */ struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; }; static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), struct ioprio_blkcg, cpd); } static struct ioprio_blkcg * ioprio_blkcg_from_css(struct cgroup_subsys_state *css) { return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); return 0; } static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); int ret; if (off != 0) return -EIO; /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */ ret = sysfs_match_string(policy_name, buf); if (ret < 0) return ret; blkcg->prio_policy = ret; return nbytes; } static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; blkcg = kzalloc(sizeof(*blkcg), gfp); if (!blkcg) return NULL; blkcg->prio_policy = POLICY_NO_CHANGE; return &blkcg->cpd; } static void ioprio_free_cpd(struct blkcg_policy_data *cpd) { struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); kfree(blkcg); } static struct cftype ioprio_files[] = { { .name = "prio.class", .seq_show = ioprio_show_prio_policy, .write = ioprio_set_prio_policy, }, { } /* sentinel */ }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, .legacy_cftypes = ioprio_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, }; void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || blkcg->prio_policy == POLICY_NONE_TO_RT) { /* * For RT threads, the default priority level is 4 because * task_nice is 0. By promoting non-RT io-priority to RT-class * and default level 4, those requests that are already * RT-class but need a higher io-priority can use ioprio_set() * to achieve this. */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); return; } /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O * priority is assigned to the bio. */ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); if (prio > bio->bi_ioprio) bio->bi_ioprio = prio; } static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); } static void __exit ioprio_exit(void) { blkcg_policy_unregister(&ioprio_policy); } module_init(ioprio_init); module_exit(ioprio_exit);
1 11 10 6 9 9 8 6 7 2 6 2 2 1 1 2 6 4 3 2 1 2 4 1 11 3 2 1 3 3 2 2 3 1 1 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * Copyright (c) 2012 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/string.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_tables.h> #include <net/ip.h> struct nft_nat { u8 sreg_addr_min; u8 sreg_addr_max; u8 sreg_proto_min; u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; }; static void nft_nat_setup_addr(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { switch (priv->family) { case AF_INET: range->min_addr.ip = (__force __be32) regs->data[priv->sreg_addr_min]; range->max_addr.ip = (__force __be32) regs->data[priv->sreg_addr_max]; break; case AF_INET6: memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min], sizeof(range->min_addr.ip6)); memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max], sizeof(range->max_addr.ip6)); break; } } static void nft_nat_setup_proto(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { range->min_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_min]); range->max_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_max]); } static void nft_nat_setup_netmap(struct nf_nat_range2 *range, const struct nft_pktinfo *pkt, const struct nft_nat *priv) { struct sk_buff *skb = pkt->skb; union nf_inet_addr new_addr; __be32 netmask; int i, len = 0; switch (priv->type) { case NFT_NAT_SNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->saddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->saddr; len = sizeof(struct in6_addr); } break; case NFT_NAT_DNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->daddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->daddr; len = sizeof(struct in6_addr); } break; } for (i = 0; i < len / sizeof(__be32); i++) { netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); new_addr.ip6[i] &= ~netmask; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; } range->min_addr = new_addr; range->max_addr = new_addr; } static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); if (priv->flags & NF_NAT_RANGE_NETMAP) nft_nat_setup_netmap(&range, pkt, priv); } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_TYPE] = { .type = NLA_U32 }, [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, [NFTA_NAT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_nat *priv = nft_expr_priv(expr); int err; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; switch (priv->type) { case NFT_NAT_SNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN)); break; case NFT_NAT_DNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)); break; } return err; } static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); unsigned int alen, plen; u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL || (tb[NFTA_NAT_REG_ADDR_MIN] == NULL && tb[NFTA_NAT_REG_PROTO_MIN] == NULL)) return -EINVAL; switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { case NFT_NAT_SNAT: priv->type = NF_NAT_MANIP_SRC; break; case NFT_NAT_DNAT: priv->type = NF_NAT_MANIP_DST; break; default: return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { case NFPROTO_IPV4: alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: if (tb[NFTA_NAT_REG_ADDR_MIN]) return -EAFNOSUPPORT; break; } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) return err; } else { priv->sreg_addr_max = priv->sreg_addr_min; } priv->flags |= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); return nf_ct_netns_get(ctx->net, family); } static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); switch (priv->type) { case NF_NAT_MANIP_SRC: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) goto nla_put_failure; break; case NF_NAT_MANIP_DST: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) goto nla_put_failure; break; } if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) goto nla_put_failure; if (priv->sreg_addr_min) { if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN, priv->sreg_addr_min) || nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX, priv->sreg_addr_max)) goto nla_put_failure; } if (priv->sreg_proto_min) { if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN, priv->sreg_proto_min) || nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX, priv->sreg_proto_max)) goto nla_put_failure; } if (priv->flags != 0) { if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static void nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_nat *priv = nft_expr_priv(expr); nf_ct_netns_put(ctx->net, priv->family); } static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { .name = "nat", .ops = &nft_nat_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_TABLES_INET static void nft_nat_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); if (priv->family == nft_pf(pkt) || priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } static const struct nft_expr_ops nft_nat_inet_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_inet_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { .name = "nat", .family = NFPROTO_INET, .ops = &nft_nat_inet_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; static int nft_nat_inet_module_init(void) { return nft_register_expr(&nft_inet_nat_type); } static void nft_nat_inet_module_exit(void) { nft_unregister_expr(&nft_inet_nat_type); } #else static int nft_nat_inet_module_init(void) { return 0; } static void nft_nat_inet_module_exit(void) { } #endif static int __init nft_nat_module_init(void) { int ret = nft_nat_inet_module_init(); if (ret) return ret; ret = nft_register_expr(&nft_nat_type); if (ret) nft_nat_inet_module_exit(); return ret; } static void __exit nft_nat_module_exit(void) { nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } module_init(nft_nat_module_init); module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); MODULE_ALIAS_NFT_EXPR("nat"); MODULE_DESCRIPTION("Network Address Translation support");
116 3 116 3 254 4 3 7 4 6 4 3 4 3 7 253 115 199 116 116 253 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0 /* * lib/minmax.c: windowed min/max tracker * * Kathleen Nichols' algorithm for tracking the minimum (or maximum) * value of a data stream over some fixed time interval. (E.g., * the minimum RTT over the past five minutes.) It uses constant * space and constant time per update yet almost always delivers * the same minimum as an implementation that has to keep all the * data in the window. * * The algorithm keeps track of the best, 2nd best & 3rd best min * values, maintaining an invariant that the measurement time of * the n'th best >= n-1'th best. It also makes sure that the three * values are widely separated in the time window since that bounds * the worse case error when that data is monotonically increasing * over the window. * * Upon getting a new min, we can forget everything earlier because * it has no value - the new min is <= everything else in the window * by definition and it's the most recent. So we restart fresh on * every new min and overwrites 2nd & 3rd choices. The same property * holds for 2nd & 3rd best. */ #include <linux/module.h> #include <linux/win_minmax.h> /* As time advances, update the 1st, 2nd, and 3rd choices. */ static u32 minmax_subwin_update(struct minmax *m, u32 win, const struct minmax_sample *val) { u32 dt = val->t - m->s[0].t; if (unlikely(dt > win)) { /* * Passed entire window without a new val so make 2nd * choice the new val & 3rd choice the new 2nd choice. * we may have to iterate this since our 2nd choice * may also be outside the window (we checked on entry * that the third choice was in the window). */ m->s[0] = m->s[1]; m->s[1] = m->s[2]; m->s[2] = *val; if (unlikely(val->t - m->s[0].t > win)) { m->s[0] = m->s[1]; m->s[1] = m->s[2]; m->s[2] = *val; } } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { /* * We've passed a quarter of the window without a new val * so take a 2nd choice from the 2nd quarter of the window. */ m->s[2] = m->s[1] = *val; } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { /* * We've passed half the window without finding a new val * so take a 3rd choice from the last half of the window */ m->s[2] = *val; } return m->s[0].v; } /* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; if (unlikely(val.v >= m->s[0].v) || /* found new max? */ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ return minmax_reset(m, t, meas); /* forget earlier samples */ if (unlikely(val.v >= m->s[1].v)) m->s[2] = m->s[1] = val; else if (unlikely(val.v >= m->s[2].v)) m->s[2] = val; return minmax_subwin_update(m, win, &val); } EXPORT_SYMBOL(minmax_running_max); /* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; if (unlikely(val.v <= m->s[0].v) || /* found new min? */ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ return minmax_reset(m, t, meas); /* forget earlier samples */ if (unlikely(val.v <= m->s[1].v)) m->s[2] = m->s[1] = val; else if (unlikely(val.v <= m->s[2].v)) m->s[2] = val; return minmax_subwin_update(m, win, &val); } EXPORT_SYMBOL(minmax_running_min);
403 404 276 159 406 362 363 361 363 362 182 362 363 362 417 417 406 362 36 362 417 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Serge Hallyn <serue@us.ibm.com> * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima_queue.c * Implements queues that store template measurements and * maintains aggregate over the stored measurements * in the pre-configured TPM PCR (if available). * The measurement list is append-only. No entry is * ever removed or changed during the boot-cycle. */ #include <linux/rculist.h> #include <linux/reboot.h> #include <linux/slab.h> #include "ima.h" #define AUDIT_CAUSE_LEN_MAX 32 /* pre-allocated array of tpm_digest structures to extend a PCR */ static struct tpm_digest *digests; LIST_HEAD(ima_measurements); /* list of all measurements */ #ifdef CONFIG_IMA_KEXEC static unsigned long binary_runtime_size; #else static unsigned long binary_runtime_size = ULONG_MAX; #endif /* key: inode (before secure-hashing a file) */ struct ima_h_table ima_htable = { .len = ATOMIC_LONG_INIT(0), .violations = ATOMIC_LONG_INIT(0), .queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT }; /* mutex protects atomicity of extending measurement list * and extending the TPM PCR aggregate. Since tpm_extend can take * long (and the tpm driver uses a mutex), we can't use the spinlock. */ static DEFINE_MUTEX(ima_extend_list_mutex); /* * Used internally by the kernel to suspend measurements. * Protected by ima_extend_list_mutex. */ static bool ima_measurements_suspended; /* lookup up the digest value in the hash table, and return the entry */ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, int pcr) { struct ima_queue_entry *qe, *ret = NULL; unsigned int key; int rc; key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest, digest_value, hash_digest_size[ima_hash_algo]); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; } } rcu_read_unlock(); return ret; } /* * Calculate the memory required for serializing a single * binary_runtime_measurement list entry, which contains a * couple of variable length fields (e.g template name and data). */ static int get_binary_runtime_size(struct ima_template_entry *entry) { int size = 0; size += sizeof(u32); /* pcr */ size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); size += entry->template_data_len; return size; } /* ima_add_template_entry helper function: * - Add template entry to the measurement list and hash table, for * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ static int ima_add_digest_entry(struct ima_template_entry *entry, bool update_htable) { struct ima_queue_entry *qe; unsigned int key; qe = kmalloc(sizeof(*qe), GFP_KERNEL); if (qe == NULL) { pr_err("OUT OF MEMORY ERROR creating queue entry\n"); return -ENOMEM; } qe->entry = entry; INIT_LIST_HEAD(&qe->later); list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); if (update_htable) { key = ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } if (binary_runtime_size != ULONG_MAX) { int size; size = get_binary_runtime_size(entry); binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? binary_runtime_size + size : ULONG_MAX; } return 0; } /* * Return the amount of memory required for serializing the * entire binary_runtime_measurement list, including the ima_kexec_hdr * structure. */ unsigned long ima_get_binary_runtime_size(void) { if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) return ULONG_MAX; else return binary_runtime_size + sizeof(struct ima_kexec_hdr); } static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; if (!ima_tpm_chip) return result; result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; } /* * Add template entry to the measurement list and hash table, and * extend the pcr. * * On systems which support carrying the IMA measurement list across * kexec, maintain the total memory size required for serializing the * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; int result = 0, tpmresult = 0; mutex_lock(&ima_extend_list_mutex); /* * Avoid appending to the measurement log when the TPM subsystem has * been shut down while preparing for system reboot. */ if (ima_measurements_suspended) { audit_cause = "measurements_suspended"; audit_info = 0; result = -ENODEV; goto out; } if (!violation && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) { if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; goto out; } } result = ima_add_digest_entry(entry, !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; goto out; } if (violation) /* invalidate pcr */ digests_arg = digests; tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); audit_cause = tpm_audit_cause; audit_info = 0; } out: mutex_unlock(&ima_extend_list_mutex); integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } int ima_restore_measurement_entry(struct ima_template_entry *entry) { int result = 0; mutex_lock(&ima_extend_list_mutex); result = ima_add_digest_entry(entry, 0); mutex_unlock(&ima_extend_list_mutex); return result; } static void ima_measurements_suspend(void) { mutex_lock(&ima_extend_list_mutex); ima_measurements_suspended = true; mutex_unlock(&ima_extend_list_mutex); } static int ima_reboot_notifier(struct notifier_block *nb, unsigned long action, void *data) { #ifdef CONFIG_IMA_KEXEC if (action == SYS_RESTART && data && !strcmp(data, "kexec reboot")) ima_measure_kexec_event("kexec_execute"); #endif ima_measurements_suspend(); return NOTIFY_DONE; } static struct notifier_block ima_reboot_nb = { .notifier_call = ima_reboot_notifier, }; void __init ima_init_reboot_notifier(void) { register_reboot_notifier(&ima_reboot_nb); } int __init ima_init_digests(void) { u16 digest_size; u16 crypto_id; int i; if (!ima_tpm_chip) return 0; digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests), GFP_NOFS); if (!digests) return -ENOMEM; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; digest_size = ima_tpm_chip->allocated_banks[i].digest_size; crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (crypto_id == HASH_ALGO__LAST) digest_size = SHA1_DIGEST_SIZE; memset(digests[i].digest, 0xff, digest_size); } return 0; }
37 12 12 12 12 12 12 12 12 12 12 12 36 25 36 24 4 24 8 2 8 8 1 8 7 3 23 21 23 8 7 6 1 7 7 8 3 31 31 3 11 9 9 8 31 8 3 3 3 3 36 36 21 21 9 9 21 1 21 14 9 7 7 6 3 4 3 3 7 34 7 7 7 5 5 7 7 2 2 22 22 174 56 41 32 12 28 23 173 7 7 7 1 1 1 1 1 7 27 27 26 11 11 11 16 16 27 27 27 3 24 24 1 1 1 23 1 23 27 4 4 27 27 5 5 5 4 4 1 4 5 1 4 53 43 53 40 34 4 4 3 4 2 2 2 2 4 33 32 33 33 31 31 28 28 28 28 28 28 2 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 31 31 31 33 35 2 34 1 33 33 32 33 2 1 1 31 30 31 31 30 31 31 30 31 31 39 38 34 35 37 38 32 32 32 30 32 13 32 7 13 6 11 3 6 7 5 4 3 1 1 4 13 9 19 2 1 1 1 2 12 12 11 10 10 1 4 1 3 10 8 4 4 3 2 2 1 1 19 20 8 19 18 16 18 17 20 264 11 6 7 11 36 30 29 30 29 26 29 29 29 29 30 4 29 4 3 28 25 24 2 24 14 24 10 24 1 24 1 24 1 24 1 24 3 24 4 24 25 21 21 21 3 20 4 16 16 15 21 9 4 2 1 3 3 9 5 2 4 3 2 2 2 1 148 151 150 151 151 151 151 151 151 151 151 151 150 151 151 14 263 9 9 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev, 0); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool strict) { /* For the default ip6tnl0 device, allow changing only the protocol * (the IP6_TNL_F_CAP_PER_PACKET flag is set on ip6tnl0, and all other * parameters are 0). */ if (strict && (!ipv6_addr_any(&p->laddr) || !ipv6_addr_any(&p->raddr) || p->flags != t->parms.flags || p->hop_limit || p->encap_limit || p->flowinfo || p->link || p->fwmark || p->collect_md)) return -EINVAL; t->parms.proto = p->proto; netdev_state_change(t->dev); return 0; } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1, false); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? : dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) { if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { /* iproute2 always sets TUNNEL_ENCAP_FLAG_CSUM6, so * let's ignore this flag. */ ipencap.flags &= ~TUNNEL_ENCAP_FLAG_CSUM6; if (memchr_inv(&ipencap, 0, sizeof(ipencap))) { NL_SET_ERR_MSG(extack, "Only protocol can be changed for fallback tunnel, not encap params"); return -EINVAL; } } ip6_tnl_netlink_parms(data, &p); if (ip6_tnl0_update(t, &p, true) < 0) { NL_SET_ERR_MSG(extack, "Only protocol can be changed for fallback tunnel"); return -EINVAL; } return 0; } if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
6 10 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KSTRTOX_H #define _LINUX_KSTRTOX_H #include <linux/compiler.h> #include <linux/types.h> /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtol(const char *s, unsigned int base, long *res); int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res); int __must_check kstrtoll(const char *s, unsigned int base, long long *res); /** * kstrtoul - convert a string to an unsigned long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoul(). Return code must be checked. */ static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res) { /* * We want to shortcut function call, but * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0. */ if (sizeof(unsigned long) == sizeof(unsigned long long) && __alignof__(unsigned long) == __alignof__(unsigned long long)) return kstrtoull(s, base, (unsigned long long *)res); else return _kstrtoul(s, base, res); } /** * kstrtol - convert a string to a long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign or a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtol(). Return code must be checked. */ static inline int __must_check kstrtol(const char *s, unsigned int base, long *res) { /* * We want to shortcut function call, but * __builtin_types_compatible_p(long, long long) = 0. */ if (sizeof(long) == sizeof(long long) && __alignof__(long) == __alignof__(long long)) return kstrtoll(s, base, (long long *)res); else return _kstrtol(s, base, res); } int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res); int __must_check kstrtoint(const char *s, unsigned int base, int *res); static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res) { return kstrtoull(s, base, res); } static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res) { return kstrtoll(s, base, res); } static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res) { return kstrtouint(s, base, res); } static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res) { return kstrtoint(s, base, res); } int __must_check kstrtou16(const char *s, unsigned int base, u16 *res); int __must_check kstrtos16(const char *s, unsigned int base, s16 *res); int __must_check kstrtou8(const char *s, unsigned int base, u8 *res); int __must_check kstrtos8(const char *s, unsigned int base, s8 *res); int __must_check kstrtobool(const char *s, bool *res); int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res); int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res); int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res); int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res); int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res); int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res); int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res); int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res); int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res); int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res); int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res); static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res) { return kstrtoull_from_user(s, count, base, res); } static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res) { return kstrtoll_from_user(s, count, base, res); } static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res) { return kstrtouint_from_user(s, count, base, res); } static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res) { return kstrtoint_from_user(s, count, base, res); } /* * Use kstrto<foo> instead. * * NOTE: simple_strto<foo> does not check for the range overflow and, * depending on the input, may give interesting results. * * Use these functions if and only if you cannot use kstrto<foo>, because * the conversion ends on the first non-digit character, which may be far * beyond the supported range. It might be useful to parse the strings like * 10x50 or 12:21 without altering original string or temporary buffer in use. * Keep in mind above caveat. */ extern unsigned long simple_strtoul(const char *,char **,unsigned int); extern unsigned long simple_strntoul(const char *,char **,unsigned int,size_t); extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); #endif /* _LINUX_KSTRTOX_H */
39 7 24 37 36 24 40 2 2 33 4 29 29 33 1 32 24 8 32 3 2 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_queue.h> #include <net/ip6_checksum.h> #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if ((protocol != IPPROTO_TCP && protocol != IPPROTO_UDP && !csum_fold(skb->csum)) || !csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len - dataoff, protocol, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } fallthrough; case CHECKSUM_NONE: if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; else skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len - dataoff, protocol, 0); csum = __skb_checksum_complete(skb); } return csum; } EXPORT_SYMBOL(nf_ip_checksum); #endif static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip_checksum(skb, hook, dataoff, protocol); fallthrough; case CHECKSUM_NONE: skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); skb->ip_summed = CHECKSUM_NONE; return __skb_checksum_complete_head(skb, dataoff + len); } return csum; } __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(skb->csum, skb_checksum(skb, 0, dataoff, 0)))) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } fallthrough; case CHECKSUM_NONE: skb->csum = ~csum_unfold( csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(0, skb_checksum(skb, 0, dataoff, 0)))); csum = __skb_checksum_complete(skb); } return csum; } EXPORT_SYMBOL(nf_ip6_checksum); static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); __wsum hsum; __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip6_checksum(skb, hook, dataoff, protocol); fallthrough; case CHECKSUM_NONE: hsum = skb_checksum(skb, 0, dataoff, 0); skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(0, hsum))); skb->ip_summed = CHECKSUM_NONE; return __skb_checksum_complete_head(skb, dataoff + len); } return csum; }; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol, unsigned short family) { __sum16 csum = 0; switch (family) { case AF_INET: csum = nf_ip_checksum(skb, hook, dataoff, protocol); break; case AF_INET6: csum = nf_ip6_checksum(skb, hook, dataoff, protocol); break; } return csum; } EXPORT_SYMBOL_GPL(nf_checksum); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol, unsigned short family) { __sum16 csum = 0; switch (family) { case AF_INET: csum = nf_ip_checksum_partial(skb, hook, dataoff, len, protocol); break; case AF_INET6: csum = nf_ip6_checksum_partial(skb, hook, dataoff, len, protocol); break; } return csum; } EXPORT_SYMBOL_GPL(nf_checksum_partial); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family) { const struct nf_ipv6_ops *v6ops __maybe_unused; int ret = 0; switch (family) { case AF_INET: ret = nf_ip_route(net, dst, fl, strict); break; case AF_INET6: ret = nf_ip6_route(net, dst, fl, strict); break; } return ret; } EXPORT_SYMBOL_GPL(nf_route); /* Only get and check the lengths, not do any hop-by-hop stuff. */ int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen) { int len, off = sizeof(struct ipv6hdr); unsigned char *nh; if (!pskb_may_pull(skb, off + 8)) return -ENOMEM; nh = (unsigned char *)(ipv6_hdr(skb) + 1); len = (nh[1] + 1) << 3; if (!pskb_may_pull(skb, off + len)) return -ENOMEM; nh = skb_network_header(skb); off += 2; len -= 2; while (len > 0) { int optlen; if (nh[off] == IPV6_TLV_PAD1) { off++; len--; continue; } if (len < 2) return -EBADMSG; optlen = nh[off + 1] + 2; if (optlen > len) return -EBADMSG; if (nh[off] == IPV6_TLV_JUMBO) { u32 pkt_len; if (nh[off + 1] != 4 || (off & 3) != 2) return -EBADMSG; pkt_len = ntohl(*(__be32 *)(nh + off + 2)); if (pkt_len <= IPV6_MAXPLEN || ipv6_hdr(skb)->payload_len) return -EBADMSG; if (pkt_len > skb->len - sizeof(struct ipv6hdr)) return -EBADMSG; *plen = pkt_len; } off += optlen; len -= optlen; } return len ? -EBADMSG : 0; } EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
784 784 784 785 2 784 1 785 769 2 1 1 769 1 768 3 768 769 768 2 769 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "rsrc.h" #include "nop.h" struct io_nop { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct file *file; int result; int fd; unsigned int flags; __u64 extra1; __u64 extra2; }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ IORING_NOP_TW | IORING_NOP_CQE32) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); nop->flags = READ_ONCE(sqe->nop_flags); if (nop->flags & ~NOP_FLAGS) return -EINVAL; if (nop->flags & IORING_NOP_INJECT_RESULT) nop->result = READ_ONCE(sqe->len); else nop->result = 0; if (nop->flags & IORING_NOP_FILE) nop->fd = READ_ONCE(sqe->fd); else nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) req->buf_index = READ_ONCE(sqe->buf_index); if (nop->flags & IORING_NOP_CQE32) { struct io_ring_ctx *ctx = req->ctx; if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) return -EINVAL; nop->extra1 = READ_ONCE(sqe->off); nop->extra2 = READ_ONCE(sqe->addr); } return 0; } int io_nop(struct io_kiocb *req, unsigned int issue_flags) { struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); int ret = nop->result; if (nop->flags & IORING_NOP_FILE) { if (nop->flags & IORING_NOP_FIXED_FILE) { req->file = io_file_get_fixed(req, nop->fd, issue_flags); req->flags |= REQ_F_FIXED_FILE; } else { req->file = io_file_get_normal(req, nop->fd); } if (!req->file) { ret = -EBADF; goto done; } } if (nop->flags & IORING_NOP_FIXED_BUFFER) { if (!io_find_buf_node(req, issue_flags)) ret = -EFAULT; } done: if (ret < 0) req_set_fail(req); if (nop->flags & IORING_NOP_CQE32) io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2); else io_req_set_res(req, nop->result, 0); if (nop->flags & IORING_NOP_TW) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return IOU_ISSUE_SKIP_COMPLETE; } return IOU_COMPLETE; }
24762 1 86 7669 24598 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 /* SPDX-License-Identifier: GPL-2.0 */ /* Perform sanity checking for object sizes for uaccess.h and uio.h. */ #ifndef __LINUX_UCOPYSIZE_H__ #define __LINUX_UCOPYSIZE_H__ #include <linux/bug.h> #ifdef CONFIG_HARDENED_USERCOPY #include <linux/jump_label.h> extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, validate_usercopy_range); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n) && static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, &validate_usercopy_range)) { __check_object_size(ptr, n, to_user); } } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); void __copy_overflow(int size, unsigned long count); static inline void copy_overflow(int size, unsigned long count) { if (IS_ENABLED(CONFIG_BUG)) __copy_overflow(size, count); } static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __builtin_object_size(addr, 0); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #endif /* __LINUX_UCOPYSIZE_H__ */
3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/bitmap.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/buffer_head.h> #include "ext4.h" unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; if (!ext4_has_feature_metadata_csum(sb)) return 1; sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; if (!ext4_has_feature_metadata_csum(sb)) return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; if (!ext4_has_feature_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_feature_metadata_csum(sb)) return; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); }
39 30 29 28 27 6 2 2 33 34 34 4 3 2 1 30 24 22 22 21 1 20 1 34 61 58 57 57 57 13 14 13 12 11 11 17 17 16 15 11 18 8 6 6 5 8 30 27 29 28 2 27 17 8 39 39 39 38 38 38 33 34 37 35 2 35 35 35 2 33 1 33 4 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock - System call implementations and user space interfaces * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI * Copyright © 2021-2025 Microsoft Corporation */ #include <asm/current.h> #include <linux/anon_inodes.h> #include <linux/bitops.h> #include <linux/build_bug.h> #include <linux/capability.h> #include <linux/cleanup.h> #include <linux/compiler_types.h> #include <linux/dcache.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/sched.h> #include <linux/security.h> #include <linux/stddef.h> #include <linux/syscalls.h> #include <linux/types.h> #include <linux/uaccess.h> #include <uapi/linux/landlock.h> #include "cred.h" #include "domain.h" #include "fs.h" #include "limits.h" #include "net.h" #include "ruleset.h" #include "setup.h" static bool is_initialized(void) { if (likely(landlock_initialized)) return true; pr_warn_once( "Disabled but requested by user space. " "You should enable Landlock at boot time: " "https://docs.kernel.org/userspace-api/landlock.html#boot-time-configuration\n"); return false; } /** * copy_min_struct_from_user - Safe future-proof argument copying * * Extend copy_struct_from_user() to check for consistent user buffer. * * @dst: Kernel space pointer or NULL. * @ksize: Actual size of the data pointed to by @dst. * @ksize_min: Minimal required size to be copied. * @src: User space pointer or NULL. * @usize: (Alleged) size of the data pointed to by @src. */ static __always_inline int copy_min_struct_from_user(void *const dst, const size_t ksize, const size_t ksize_min, const void __user *const src, const size_t usize) { /* Checks buffer inconsistencies. */ BUILD_BUG_ON(!dst); if (!src) return -EFAULT; /* Checks size ranges. */ BUILD_BUG_ON(ksize <= 0); BUILD_BUG_ON(ksize < ksize_min); if (usize < ksize_min) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; /* Copies user buffer and fills with zeros. */ return copy_struct_from_user(dst, ksize, src, usize); } /* * This function only contains arithmetic operations with constants, leading to * BUILD_BUG_ON(). The related code is evaluated and checked at build time, * but it is then ignored thanks to compiler optimizations. */ static void build_check_abi(void) { struct landlock_ruleset_attr ruleset_attr; struct landlock_path_beneath_attr path_beneath_attr; struct landlock_net_port_attr net_port_attr; size_t ruleset_size, path_beneath_size, net_port_size; /* * For each user space ABI structures, first checks that there is no * hole in them, then checks that all architectures have the same * struct size. */ ruleset_size = sizeof(ruleset_attr.handled_access_fs); ruleset_size += sizeof(ruleset_attr.handled_access_net); ruleset_size += sizeof(ruleset_attr.scoped); BUILD_BUG_ON(sizeof(ruleset_attr) != ruleset_size); BUILD_BUG_ON(sizeof(ruleset_attr) != 24); path_beneath_size = sizeof(path_beneath_attr.allowed_access); path_beneath_size += sizeof(path_beneath_attr.parent_fd); BUILD_BUG_ON(sizeof(path_beneath_attr) != path_beneath_size); BUILD_BUG_ON(sizeof(path_beneath_attr) != 12); net_port_size = sizeof(net_port_attr.allowed_access); net_port_size += sizeof(net_port_attr.port); BUILD_BUG_ON(sizeof(net_port_attr) != net_port_size); BUILD_BUG_ON(sizeof(net_port_attr) != 16); } /* Ruleset handling */ static int fop_ruleset_release(struct inode *const inode, struct file *const filp) { struct landlock_ruleset *ruleset = filp->private_data; landlock_put_ruleset(ruleset); return 0; } static ssize_t fop_dummy_read(struct file *const filp, char __user *const buf, const size_t size, loff_t *const ppos) { /* Dummy handler to enable FMODE_CAN_READ. */ return -EINVAL; } static ssize_t fop_dummy_write(struct file *const filp, const char __user *const buf, const size_t size, loff_t *const ppos) { /* Dummy handler to enable FMODE_CAN_WRITE. */ return -EINVAL; } /* * A ruleset file descriptor enables to build a ruleset by adding (i.e. * writing) rule after rule, without relying on the task's context. This * reentrant design is also used in a read way to enforce the ruleset on the * current task. */ static const struct file_operations ruleset_fops = { .release = fop_ruleset_release, .read = fop_dummy_read, .write = fop_dummy_write, }; /* * The Landlock ABI version should be incremented for each new Landlock-related * user space visible change (e.g. Landlock syscalls). This version should * only be incremented once per Linux release, and the date in * Documentation/userspace-api/landlock.rst should be updated to reflect the * UAPI change. */ const int landlock_abi_version = 7; /** * sys_landlock_create_ruleset - Create a new ruleset * * @attr: Pointer to a &struct landlock_ruleset_attr identifying the scope of * the new ruleset. * @size: Size of the pointed &struct landlock_ruleset_attr (needed for * backward and forward compatibility). * @flags: Supported values: * * - %LANDLOCK_CREATE_RULESET_VERSION * - %LANDLOCK_CREATE_RULESET_ERRATA * * This system call enables to create a new Landlock ruleset, and returns the * related file descriptor on success. * * If %LANDLOCK_CREATE_RULESET_VERSION or %LANDLOCK_CREATE_RULESET_ERRATA is * set, then @attr must be NULL and @size must be 0. * * Possible returned errors are: * * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; * - %EINVAL: unknown @flags, or unknown access, or unknown scope, or too small @size; * - %E2BIG: @attr or @size inconsistencies; * - %EFAULT: @attr or @size inconsistencies; * - %ENOMSG: empty &landlock_ruleset_attr.handled_access_fs. * * .. kernel-doc:: include/uapi/linux/landlock.h * :identifiers: landlock_create_ruleset_flags */ SYSCALL_DEFINE3(landlock_create_ruleset, const struct landlock_ruleset_attr __user *const, attr, const size_t, size, const __u32, flags) { struct landlock_ruleset_attr ruleset_attr; struct landlock_ruleset *ruleset; int err, ruleset_fd; /* Build-time checks. */ build_check_abi(); if (!is_initialized()) return -EOPNOTSUPP; if (flags) { if (attr || size) return -EINVAL; if (flags == LANDLOCK_CREATE_RULESET_VERSION) return landlock_abi_version; if (flags == LANDLOCK_CREATE_RULESET_ERRATA) return landlock_errata; return -EINVAL; } /* Copies raw user space buffer. */ err = copy_min_struct_from_user(&ruleset_attr, sizeof(ruleset_attr), offsetofend(typeof(ruleset_attr), handled_access_fs), attr, size); if (err) return err; /* Checks content (and 32-bits cast). */ if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) != LANDLOCK_MASK_ACCESS_FS) return -EINVAL; /* Checks network content (and 32-bits cast). */ if ((ruleset_attr.handled_access_net | LANDLOCK_MASK_ACCESS_NET) != LANDLOCK_MASK_ACCESS_NET) return -EINVAL; /* Checks IPC scoping content (and 32-bits cast). */ if ((ruleset_attr.scoped | LANDLOCK_MASK_SCOPE) != LANDLOCK_MASK_SCOPE) return -EINVAL; /* Checks arguments and transforms to kernel struct. */ ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs, ruleset_attr.handled_access_net, ruleset_attr.scoped); if (IS_ERR(ruleset)) return PTR_ERR(ruleset); /* Creates anonymous FD referring to the ruleset. */ ruleset_fd = anon_inode_getfd("[landlock-ruleset]", &ruleset_fops, ruleset, O_RDWR | O_CLOEXEC); if (ruleset_fd < 0) landlock_put_ruleset(ruleset); return ruleset_fd; } /* * Returns an owned ruleset from a FD. It is thus needed to call * landlock_put_ruleset() on the return value. */ static struct landlock_ruleset *get_ruleset_from_fd(const int fd, const fmode_t mode) { CLASS(fd, ruleset_f)(fd); struct landlock_ruleset *ruleset; if (fd_empty(ruleset_f)) return ERR_PTR(-EBADF); /* Checks FD type and access right. */ if (fd_file(ruleset_f)->f_op != &ruleset_fops) return ERR_PTR(-EBADFD); if (!(fd_file(ruleset_f)->f_mode & mode)) return ERR_PTR(-EPERM); ruleset = fd_file(ruleset_f)->private_data; if (WARN_ON_ONCE(ruleset->num_layers != 1)) return ERR_PTR(-EINVAL); landlock_get_ruleset(ruleset); return ruleset; } /* Path handling */ /* * @path: Must call put_path(@path) after the call if it succeeded. */ static int get_path_from_fd(const s32 fd, struct path *const path) { CLASS(fd_raw, f)(fd); BUILD_BUG_ON(!__same_type( fd, ((struct landlock_path_beneath_attr *)NULL)->parent_fd)); if (fd_empty(f)) return -EBADF; /* * Forbids ruleset FDs, internal filesystems (e.g. nsfs), including * pseudo filesystems that will never be mountable (e.g. sockfs, * pipefs). */ if ((fd_file(f)->f_op == &ruleset_fops) || (fd_file(f)->f_path.mnt->mnt_flags & MNT_INTERNAL) || (fd_file(f)->f_path.dentry->d_sb->s_flags & SB_NOUSER) || IS_PRIVATE(d_backing_inode(fd_file(f)->f_path.dentry))) return -EBADFD; *path = fd_file(f)->f_path; path_get(path); return 0; } static int add_rule_path_beneath(struct landlock_ruleset *const ruleset, const void __user *const rule_attr) { struct landlock_path_beneath_attr path_beneath_attr; struct path path; int res, err; access_mask_t mask; /* Copies raw user space buffer. */ res = copy_from_user(&path_beneath_attr, rule_attr, sizeof(path_beneath_attr)); if (res) return -EFAULT; /* * Informs about useless rule: empty allowed_access (i.e. deny rules) * are ignored in path walks. */ if (!path_beneath_attr.allowed_access) return -ENOMSG; /* Checks that allowed_access matches the @ruleset constraints. */ mask = ruleset->access_masks[0].fs; if ((path_beneath_attr.allowed_access | mask) != mask) return -EINVAL; /* Gets and checks the new rule. */ err = get_path_from_fd(path_beneath_attr.parent_fd, &path); if (err) return err; /* Imports the new rule. */ err = landlock_append_fs_rule(ruleset, &path, path_beneath_attr.allowed_access); path_put(&path); return err; } static int add_rule_net_port(struct landlock_ruleset *ruleset, const void __user *const rule_attr) { struct landlock_net_port_attr net_port_attr; int res; access_mask_t mask; /* Copies raw user space buffer. */ res = copy_from_user(&net_port_attr, rule_attr, sizeof(net_port_attr)); if (res) return -EFAULT; /* * Informs about useless rule: empty allowed_access (i.e. deny rules) * are ignored by network actions. */ if (!net_port_attr.allowed_access) return -ENOMSG; /* Checks that allowed_access matches the @ruleset constraints. */ mask = landlock_get_net_access_mask(ruleset, 0); if ((net_port_attr.allowed_access | mask) != mask) return -EINVAL; /* Denies inserting a rule with port greater than 65535. */ if (net_port_attr.port > U16_MAX) return -EINVAL; /* Imports the new rule. */ return landlock_append_net_rule(ruleset, net_port_attr.port, net_port_attr.allowed_access); } /** * sys_landlock_add_rule - Add a new rule to a ruleset * * @ruleset_fd: File descriptor tied to the ruleset that should be extended * with the new rule. * @rule_type: Identify the structure type pointed to by @rule_attr: * %LANDLOCK_RULE_PATH_BENEATH or %LANDLOCK_RULE_NET_PORT. * @rule_attr: Pointer to a rule (matching the @rule_type). * @flags: Must be 0. * * This system call enables to define a new rule and add it to an existing * ruleset. * * Possible returned errors are: * * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; * - %EAFNOSUPPORT: @rule_type is %LANDLOCK_RULE_NET_PORT but TCP/IP is not * supported by the running kernel; * - %EINVAL: @flags is not 0; * - %EINVAL: The rule accesses are inconsistent (i.e. * &landlock_path_beneath_attr.allowed_access or * &landlock_net_port_attr.allowed_access is not a subset of the ruleset * handled accesses) * - %EINVAL: &landlock_net_port_attr.port is greater than 65535; * - %ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access is * 0); * - %EBADF: @ruleset_fd is not a file descriptor for the current thread, or a * member of @rule_attr is not a file descriptor as expected; * - %EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of * @rule_attr is not the expected file descriptor type; * - %EPERM: @ruleset_fd has no write access to the underlying ruleset; * - %EFAULT: @rule_attr was not a valid address. */ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, const enum landlock_rule_type, rule_type, const void __user *const, rule_attr, const __u32, flags) { struct landlock_ruleset *ruleset __free(landlock_put_ruleset) = NULL; if (!is_initialized()) return -EOPNOTSUPP; /* No flag for now. */ if (flags) return -EINVAL; /* Gets and checks the ruleset. */ ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_WRITE); if (IS_ERR(ruleset)) return PTR_ERR(ruleset); switch (rule_type) { case LANDLOCK_RULE_PATH_BENEATH: return add_rule_path_beneath(ruleset, rule_attr); case LANDLOCK_RULE_NET_PORT: return add_rule_net_port(ruleset, rule_attr); default: return -EINVAL; } } /* Enforcement */ /** * sys_landlock_restrict_self - Enforce a ruleset on the calling thread * * @ruleset_fd: File descriptor tied to the ruleset to merge with the target. * @flags: Supported values: * * - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF * - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON * - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF * * This system call enables to enforce a Landlock ruleset on the current * thread. Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its * namespace or is running with no_new_privs. This avoids scenarios where * unprivileged tasks can affect the behavior of privileged children. * * Possible returned errors are: * * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; * - %EINVAL: @flags contains an unknown bit. * - %EBADF: @ruleset_fd is not a file descriptor for the current thread; * - %EBADFD: @ruleset_fd is not a ruleset file descriptor; * - %EPERM: @ruleset_fd has no read access to the underlying ruleset, or the * current thread is not running with no_new_privs, or it doesn't have * %CAP_SYS_ADMIN in its namespace. * - %E2BIG: The maximum number of stacked rulesets is reached for the current * thread. * * .. kernel-doc:: include/uapi/linux/landlock.h * :identifiers: landlock_restrict_self_flags */ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, flags) { struct landlock_ruleset *new_dom, *ruleset __free(landlock_put_ruleset) = NULL; struct cred *new_cred; struct landlock_cred_security *new_llcred; bool __maybe_unused log_same_exec, log_new_exec, log_subdomains, prev_log_subdomains; if (!is_initialized()) return -EOPNOTSUPP; /* * Similar checks as for seccomp(2), except that an -EPERM may be * returned. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; if ((flags | LANDLOCK_MASK_RESTRICT_SELF) != LANDLOCK_MASK_RESTRICT_SELF) return -EINVAL; /* Translates "off" flag to boolean. */ log_same_exec = !(flags & LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF); /* Translates "on" flag to boolean. */ log_new_exec = !!(flags & LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON); /* Translates "off" flag to boolean. */ log_subdomains = !(flags & LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF); /* * It is allowed to set LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF with * -1 as ruleset_fd, but no other flag must be set. */ if (!(ruleset_fd == -1 && flags == LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF)) { /* Gets and checks the ruleset. */ ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_READ); if (IS_ERR(ruleset)) return PTR_ERR(ruleset); } /* Prepares new credentials. */ new_cred = prepare_creds(); if (!new_cred) return -ENOMEM; new_llcred = landlock_cred(new_cred); #ifdef CONFIG_AUDIT prev_log_subdomains = !new_llcred->log_subdomains_off; new_llcred->log_subdomains_off = !prev_log_subdomains || !log_subdomains; #endif /* CONFIG_AUDIT */ /* * The only case when a ruleset may not be set is if * LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF is set and ruleset_fd is -1. * We could optimize this case by not calling commit_creds() if this flag * was already set, but it is not worth the complexity. */ if (!ruleset) return commit_creds(new_cred); /* * There is no possible race condition while copying and manipulating * the current credentials because they are dedicated per thread. */ new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset); if (IS_ERR(new_dom)) { abort_creds(new_cred); return PTR_ERR(new_dom); } #ifdef CONFIG_AUDIT new_dom->hierarchy->log_same_exec = log_same_exec; new_dom->hierarchy->log_new_exec = log_new_exec; if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains) new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED; #endif /* CONFIG_AUDIT */ /* Replaces the old (prepared) domain. */ landlock_put_ruleset(new_llcred->domain); new_llcred->domain = new_dom; #ifdef CONFIG_AUDIT new_llcred->domain_exec |= BIT(new_dom->num_layers - 1); #endif /* CONFIG_AUDIT */ return commit_creds(new_cred); }
162 162 162 55 55 4 55 25 5 21 19 5 5 4 20 64 64 64 26 26 12 64 62 21 5 7 64 7 64 64 63 64 64 64 22 67 67 67 60 58 60 99 15 15 7 6 1 7 6 7 4 15 7 3 3 3 3 3 3 3 4 44 11 11 11 2 10 1 1 9 5 4 1 8 3 3 2 6 3 4 2 4 2 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Priority Queue * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> */ #include <linux/time.h> #include <linux/slab.h> #include <sound/core.h> #include "seq_timer.h" #include "seq_prioq.h" /* Implementation is a simple linked list for now... This priority queue orders the events on timestamp. For events with an equeal timestamp the queue behaves as a FIFO. * * +-------+ * Head --> | first | * +-------+ * |next * +-----v-+ * | | * +-------+ * | * +-----v-+ * | | * +-------+ * | * +-----v-+ * Tail --> | last | * +-------+ * */ /* create new prioq (constructor) */ struct snd_seq_prioq *snd_seq_prioq_new(void) { struct snd_seq_prioq *f; f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) return NULL; spin_lock_init(&f->lock); f->head = NULL; f->tail = NULL; f->cells = 0; return f; } /* delete prioq (destructor) */ void snd_seq_prioq_delete(struct snd_seq_prioq **fifo) { struct snd_seq_prioq *f = *fifo; *fifo = NULL; if (f == NULL) { pr_debug("ALSA: seq: snd_seq_prioq_delete() called with NULL prioq\n"); return; } /* release resources...*/ /*....................*/ if (f->cells > 0) { /* drain prioQ */ while (f->cells > 0) snd_seq_cell_free(snd_seq_prioq_cell_out(f, NULL)); } kfree(f); } /* compare timestamp between events */ /* return 1 if a >= b; 0 */ static inline int compare_timestamp(struct snd_seq_event *a, struct snd_seq_event *b) { if ((a->flags & SNDRV_SEQ_TIME_STAMP_MASK) == SNDRV_SEQ_TIME_STAMP_TICK) { /* compare ticks */ return (snd_seq_compare_tick_time(&a->time.tick, &b->time.tick)); } else { /* compare real time */ return (snd_seq_compare_real_time(&a->time.time, &b->time.time)); } } /* compare timestamp between events */ /* return negative if a < b; * zero if a = b; * positive if a > b; */ static inline int compare_timestamp_rel(struct snd_seq_event *a, struct snd_seq_event *b) { if ((a->flags & SNDRV_SEQ_TIME_STAMP_MASK) == SNDRV_SEQ_TIME_STAMP_TICK) { /* compare ticks */ if (a->time.tick > b->time.tick) return 1; else if (a->time.tick == b->time.tick) return 0; else return -1; } else { /* compare real time */ if (a->time.time.tv_sec > b->time.time.tv_sec) return 1; else if (a->time.time.tv_sec == b->time.time.tv_sec) { if (a->time.time.tv_nsec > b->time.time.tv_nsec) return 1; else if (a->time.time.tv_nsec == b->time.time.tv_nsec) return 0; else return -1; } else return -1; } } /* enqueue cell to prioq */ int snd_seq_prioq_cell_in(struct snd_seq_prioq * f, struct snd_seq_event_cell * cell) { struct snd_seq_event_cell *cur, *prev; int count; int prior; if (snd_BUG_ON(!f || !cell)) return -EINVAL; /* check flags */ prior = (cell->event.flags & SNDRV_SEQ_PRIORITY_MASK); guard(spinlock_irqsave)(&f->lock); /* check if this element needs to inserted at the end (ie. ordered data is inserted) This will be very likeley if a sequencer application or midi file player is feeding us (sequential) data */ if (f->tail && !prior) { if (compare_timestamp(&cell->event, &f->tail->event)) { /* add new cell to tail of the fifo */ f->tail->next = cell; f->tail = cell; cell->next = NULL; f->cells++; return 0; } } /* traverse list of elements to find the place where the new cell is to be inserted... Note that this is a order n process ! */ prev = NULL; /* previous cell */ cur = f->head; /* cursor */ count = 10000; /* FIXME: enough big, isn't it? */ while (cur != NULL) { /* compare timestamps */ int rel = compare_timestamp_rel(&cell->event, &cur->event); if (rel < 0) /* new cell has earlier schedule time, */ break; else if (rel == 0 && prior) /* equal schedule time and prior to others */ break; /* new cell has equal or larger schedule time, */ /* move cursor to next cell */ prev = cur; cur = cur->next; if (! --count) { pr_err("ALSA: seq: cannot find a pointer.. infinite loop?\n"); return -EINVAL; } } /* insert it before cursor */ if (prev != NULL) prev->next = cell; cell->next = cur; if (f->head == cur) /* this is the first cell, set head to it */ f->head = cell; if (cur == NULL) /* reached end of the list */ f->tail = cell; f->cells++; return 0; } /* return 1 if the current time >= event timestamp */ static int event_is_ready(struct snd_seq_event *ev, void *current_time) { if ((ev->flags & SNDRV_SEQ_TIME_STAMP_MASK) == SNDRV_SEQ_TIME_STAMP_TICK) return snd_seq_compare_tick_time(current_time, &ev->time.tick); else return snd_seq_compare_real_time(current_time, &ev->time.time); } /* dequeue cell from prioq */ struct snd_seq_event_cell *snd_seq_prioq_cell_out(struct snd_seq_prioq *f, void *current_time) { struct snd_seq_event_cell *cell; if (f == NULL) { pr_debug("ALSA: seq: snd_seq_prioq_cell_in() called with NULL prioq\n"); return NULL; } guard(spinlock_irqsave)(&f->lock); cell = f->head; if (cell && current_time && !event_is_ready(&cell->event, current_time)) cell = NULL; if (cell) { f->head = cell->next; /* reset tail if this was the last element */ if (f->tail == cell) f->tail = NULL; cell->next = NULL; f->cells--; } return cell; } /* return number of events available in prioq */ int snd_seq_prioq_avail(struct snd_seq_prioq * f) { if (f == NULL) { pr_debug("ALSA: seq: snd_seq_prioq_cell_in() called with NULL prioq\n"); return 0; } return f->cells; } /* remove cells matching with the condition */ static void prioq_remove_cells(struct snd_seq_prioq *f, bool (*match)(struct snd_seq_event_cell *cell, void *arg), void *arg) { register struct snd_seq_event_cell *cell, *next; struct snd_seq_event_cell *prev = NULL; struct snd_seq_event_cell *freefirst = NULL, *freeprev = NULL, *freenext; /* collect all removed cells */ scoped_guard(spinlock_irqsave, &f->lock) { for (cell = f->head; cell; cell = next) { next = cell->next; if (!match(cell, arg)) { prev = cell; continue; } /* remove cell from prioq */ if (cell == f->head) f->head = cell->next; else prev->next = cell->next; if (cell == f->tail) f->tail = cell->next; f->cells--; /* add cell to free list */ cell->next = NULL; if (freefirst == NULL) freefirst = cell; else freeprev->next = cell; freeprev = cell; } } /* remove selected cells */ while (freefirst) { freenext = freefirst->next; snd_seq_cell_free(freefirst); freefirst = freenext; } } struct prioq_match_arg { int client; int timestamp; }; static inline bool prioq_match(struct snd_seq_event_cell *cell, void *arg) { struct prioq_match_arg *v = arg; if (cell->event.source.client == v->client || cell->event.dest.client == v->client) return true; if (!v->timestamp) return false; switch (cell->event.flags & SNDRV_SEQ_TIME_STAMP_MASK) { case SNDRV_SEQ_TIME_STAMP_TICK: if (cell->event.time.tick) return true; break; case SNDRV_SEQ_TIME_STAMP_REAL: if (cell->event.time.time.tv_sec || cell->event.time.time.tv_nsec) return true; break; } return false; } /* remove cells for left client */ void snd_seq_prioq_leave(struct snd_seq_prioq *f, int client, int timestamp) { struct prioq_match_arg arg = { client, timestamp }; return prioq_remove_cells(f, prioq_match, &arg); } struct prioq_remove_match_arg { int client; struct snd_seq_remove_events *info; }; static bool prioq_remove_match(struct snd_seq_event_cell *cell, void *arg) { struct prioq_remove_match_arg *v = arg; struct snd_seq_event *ev = &cell->event; struct snd_seq_remove_events *info = v->info; int res; if (ev->source.client != v->client) return false; if (info->remove_mode & SNDRV_SEQ_REMOVE_DEST) { if (ev->dest.client != info->dest.client || ev->dest.port != info->dest.port) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_DEST_CHANNEL) { if (! snd_seq_ev_is_channel_type(ev)) return false; /* data.note.channel and data.control.channel are identical */ if (ev->data.note.channel != info->channel) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_AFTER) { if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_TICK) res = snd_seq_compare_tick_time(&ev->time.tick, &info->time.tick); else res = snd_seq_compare_real_time(&ev->time.time, &info->time.time); if (!res) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_BEFORE) { if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_TICK) res = snd_seq_compare_tick_time(&ev->time.tick, &info->time.tick); else res = snd_seq_compare_real_time(&ev->time.time, &info->time.time); if (res) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_EVENT_TYPE) { if (ev->type != info->type) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_IGNORE_OFF) { /* Do not remove off events */ switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEOFF: /* case SNDRV_SEQ_EVENT_SAMPLE_STOP: */ return false; default: break; } } if (info->remove_mode & SNDRV_SEQ_REMOVE_TAG_MATCH) { if (info->tag != ev->tag) return false; } return true; } /* remove cells matching remove criteria */ void snd_seq_prioq_remove_events(struct snd_seq_prioq * f, int client, struct snd_seq_remove_events *info) { struct prioq_remove_match_arg arg = { client, info }; return prioq_remove_cells(f, prioq_remove_match, &arg); }
36 36 37 36 9 9 8 9 37 3 14 3 3 1 4 2 26 2 23 17 10 8 8 8 14 24 35 17 22 39 24 24 24 12 12 23 18 12 19 5 3 11 19 23 4 4 2 2 4 4 12 12 12 13 12 10 9 11 10 8 7 6 2 4 8 12 13 8 4 4 1 24 24 23 11 6 3 7 21 21 6 15 21 1 20 20 24 32 31 32 26 25 13 24 24 12 12 24 2 1 1 24 24 24 15 12 11 4 2 1 11 35 32 23 9 35 15 15 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 // SPDX-License-Identifier: GPL-2.0 /* * fs/timerfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * * * Thanks to Thomas Gleixner for code reviews and useful comments. * */ #include <linux/alarmtimer.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/time.h> #include <linux/hrtimer.h> #include <linux/anon_inodes.h> #include <linux/timerfd.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/rcupdate.h> #include <linux/time_namespace.h> struct timerfd_ctx { union { struct hrtimer tmr; struct alarm alarm; } t; ktime_t tintv; ktime_t moffs; wait_queue_head_t wqh; u64 ticks; int clockid; short unsigned expired; short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; spinlock_t cancel_lock; bool might_cancel; }; static LIST_HEAD(cancel_list); static DEFINE_SPINLOCK(cancel_lock); static inline bool isalarm(struct timerfd_ctx *ctx) { return ctx->clockid == CLOCK_REALTIME_ALARM || ctx->clockid == CLOCK_BOOTTIME_ALARM; } /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, * tintv != 0) until the timer is accessed. */ static void timerfd_triggered(struct timerfd_ctx *ctx) { unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); ctx->expired = 1; ctx->ticks++; wake_up_locked_poll(&ctx->wqh, EPOLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); } static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) { struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, t.tmr); timerfd_triggered(ctx); return HRTIMER_NORESTART; } static void timerfd_alarmproc(struct alarm *alarm, ktime_t now) { struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, t.alarm); timerfd_triggered(ctx); } /* * Called when the clock was set to cancel the timers in the cancel * list. This will wake up processes waiting on these timers. The * wake-up requires ctx->ticks to be non zero, therefore we increment * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { ktime_t moffs = ktime_mono_to_real(0); struct timerfd_ctx *ctx; unsigned long flags; rcu_read_lock(); list_for_each_entry_rcu(ctx, &cancel_list, clist) { if (!ctx->might_cancel) continue; spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs != moffs) { ctx->moffs = KTIME_MAX; ctx->ticks++; wake_up_locked_poll(&ctx->wqh, EPOLLIN); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); } rcu_read_unlock(); } static void timerfd_resume_work(struct work_struct *work) { timerfd_clock_was_set(); } static DECLARE_WORK(timerfd_work, timerfd_resume_work); /* * Invoked from timekeeping_resume(). Defer the actual update to work so * timerfd_clock_was_set() runs in task context. */ void timerfd_resume(void) { schedule_work(&timerfd_work); } static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { ctx->might_cancel = false; spin_lock(&cancel_lock); list_del_rcu(&ctx->clist); spin_unlock(&cancel_lock); } } static void timerfd_remove_cancel(struct timerfd_ctx *ctx) { spin_lock(&ctx->cancel_lock); __timerfd_remove_cancel(ctx); spin_unlock(&ctx->cancel_lock); } static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs != KTIME_MAX) return false; ctx->moffs = ktime_mono_to_real(0); return true; } static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { spin_lock(&ctx->cancel_lock); if ((ctx->clockid == CLOCK_REALTIME || ctx->clockid == CLOCK_REALTIME_ALARM) && (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { if (!ctx->might_cancel) { ctx->might_cancel = true; spin_lock(&cancel_lock); list_add_rcu(&ctx->clist, &cancel_list); spin_unlock(&cancel_lock); } } else { __timerfd_remove_cancel(ctx); } spin_unlock(&ctx->cancel_lock); } static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { ktime_t remaining; if (isalarm(ctx)) remaining = alarm_expires_remaining(&ctx->t.alarm); else remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr); return remaining < 0 ? 0: remaining; } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, const struct itimerspec64 *ktmr) { enum hrtimer_mode htmode; ktime_t texp; int clockid = ctx->clockid; htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; texp = timespec64_to_ktime(ktmr->it_value); ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec64_to_ktime(ktmr->it_interval); if (isalarm(ctx)) { alarm_init(&ctx->t.alarm, ctx->clockid == CLOCK_REALTIME_ALARM ? ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); } if (texp != 0) { if (flags & TFD_TIMER_ABSTIME) texp = timens_ktime_to_host(clockid, texp); if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); else alarm_start_relative(&ctx->t.alarm, texp); } else { hrtimer_start(&ctx->t.tmr, texp, htmode); } if (timerfd_canceled(ctx)) return -ECANCELED; } ctx->settime_flags = flags & TFD_SETTIME_FLAGS; return 0; } static int timerfd_release(struct inode *inode, struct file *file) { struct timerfd_ctx *ctx = file->private_data; timerfd_remove_cancel(ctx); if (isalarm(ctx)) alarm_cancel(&ctx->t.alarm); else hrtimer_cancel(&ctx->t.tmr); kfree_rcu(ctx, rcu); return 0; } static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; __poll_t events = 0; unsigned long flags; poll_wait(file, &ctx->wqh, wait); spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->ticks) events |= EPOLLIN; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return events; } static ssize_t timerfd_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct timerfd_ctx *ctx = file->private_data; ssize_t res; u64 ticks = 0; if (iov_iter_count(to) < sizeof(ticks)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT) res = -EAGAIN; else res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); /* * If clock has changed, we do not care about the * ticks and we do not rearm the timer. Userspace must * reevaluate anyway. */ if (timerfd_canceled(ctx)) { ctx->ticks = 0; ctx->expired = 0; res = -ECANCELED; } if (ctx->ticks) { ticks = ctx->ticks; if (ctx->expired && ctx->tintv) { /* * If tintv != 0, this is a periodic timer that * needs to be re-armed. We avoid doing it in the timer * callback to avoid DoS attacks specifying a very * short timer period. */ if (isalarm(ctx)) { ticks += alarm_forward_now( &ctx->t.alarm, ctx->tintv) - 1; alarm_restart(&ctx->t.alarm); } else { ticks += hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->t.tmr); } } ctx->expired = 0; ctx->ticks = 0; } spin_unlock_irq(&ctx->wqh.lock); if (ticks) { res = copy_to_iter(&ticks, sizeof(ticks), to); if (!res) res = -EFAULT; } return res; } #ifdef CONFIG_PROC_FS static void timerfd_show(struct seq_file *m, struct file *file) { struct timerfd_ctx *ctx = file->private_data; struct timespec64 value, interval; spin_lock_irq(&ctx->wqh.lock); value = ktime_to_timespec64(timerfd_get_remaining(ctx)); interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "clockid: %d\n" "ticks: %llu\n" "settime flags: 0%o\n" "it_value: (%llu, %llu)\n" "it_interval: (%llu, %llu)\n", ctx->clockid, (unsigned long long)ctx->ticks, ctx->settime_flags, (unsigned long long)value.tv_sec, (unsigned long long)value.tv_nsec, (unsigned long long)interval.tv_sec, (unsigned long long)interval.tv_nsec); } #else #define timerfd_show NULL #endif #ifdef CONFIG_CHECKPOINT_RESTORE static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct timerfd_ctx *ctx = file->private_data; int ret = 0; switch (cmd) { case TFD_IOC_SET_TICKS: { u64 ticks; if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks))) return -EFAULT; if (!ticks) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (!timerfd_canceled(ctx)) { ctx->ticks = ticks; wake_up_locked_poll(&ctx->wqh, EPOLLIN); } else ret = -ECANCELED; spin_unlock_irq(&ctx->wqh.lock); break; } default: ret = -ENOTTY; break; } return ret; } #else #define timerfd_ioctl NULL #endif static const struct file_operations timerfd_fops = { .release = timerfd_release, .poll = timerfd_poll, .read_iter = timerfd_read_iter, .llseek = noop_llseek, .show_fdinfo = timerfd_show, .unlocked_ioctl = timerfd_ioctl, }; SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; struct file *file; /* Check the TFD_* constants for consistency. */ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); if ((flags & ~TFD_CREATE_FLAGS) || (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME && clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; if ((clockid == CLOCK_REALTIME_ALARM || clockid == CLOCK_BOOTTIME_ALARM) && !capable(CAP_WAKE_ALARM)) return -EPERM; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; init_waitqueue_head(&ctx->wqh); spin_lock_init(&ctx->cancel_lock); ctx->clockid = clockid; if (isalarm(ctx)) alarm_init(&ctx->t.alarm, ctx->clockid == CLOCK_REALTIME_ALARM ? ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); ufd = get_unused_fd_flags(flags & TFD_SHARED_FCNTL_FLAGS); if (ufd < 0) { kfree(ctx); return ufd; } file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } fd_install(ufd, file); return ufd; } static int do_timerfd_settime(int ufd, int flags, const struct itimerspec64 *new, struct itimerspec64 *old) { struct timerfd_ctx *ctx; int ret; if ((flags & ~TFD_SETTIME_FLAGS) || !itimerspec64_valid(new)) return -EINVAL; CLASS(fd, f)(ufd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &timerfd_fops) return -EINVAL; ctx = fd_file(f)->private_data; if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) return -EPERM; timerfd_setup_cancel(ctx, flags); /* * We need to stop the existing timer before reprogramming * it to the new values. */ for (;;) { spin_lock_irq(&ctx->wqh.lock); if (isalarm(ctx)) { if (alarm_try_to_cancel(&ctx->t.alarm) >= 0) break; } else { if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0) break; } spin_unlock_irq(&ctx->wqh.lock); if (isalarm(ctx)) hrtimer_cancel_wait_running(&ctx->t.alarm.timer); else hrtimer_cancel_wait_running(&ctx->t.tmr); } /* * If the timer is expired and it's periodic, we need to advance it * because the caller may want to know the previous expiration time. * We do not update "ticks" and "expired" since the timer will be * re-programmed again in the following timerfd_setup() call. */ if (ctx->expired && ctx->tintv) { if (isalarm(ctx)) alarm_forward_now(&ctx->t.alarm, ctx->tintv); else hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); } old->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); old->it_interval = ktime_to_timespec64(ctx->tintv); /* * Re-program the timer to the new value ... */ ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); return ret; } static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { struct timerfd_ctx *ctx; CLASS(fd, f)(ufd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &timerfd_fops) return -EINVAL; ctx = fd_file(f)->private_data; spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv) { ctx->expired = 0; if (isalarm(ctx)) { ctx->ticks += alarm_forward_now( &ctx->t.alarm, ctx->tintv) - 1; alarm_restart(&ctx->t.alarm); } else { ctx->ticks += hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->t.tmr); } } t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); return 0; } SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct __kernel_itimerspec __user *, utmr, struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 new, old; int ret; if (get_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; if (otmr && put_itimerspec64(&old, otmr)) return -EFAULT; return ret; } SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; return put_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags, const struct old_itimerspec32 __user *, utmr, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 new, old; int ret; if (get_old_itimerspec32(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; if (otmr && put_old_itimerspec32(&old, otmr)) return -EFAULT; return ret; } SYSCALL_DEFINE2(timerfd_gettime32, int, ufd, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; return put_old_itimerspec32(&kotmr, otmr) ? -EFAULT : 0; } #endif
8 8 5 5 5 20 9 9 9 9 23 23 9 18 28 28 28 22 23 23 23 23 9 1 9 9 6 6 6 6 5 5 5 2 2 2 1 1 1 1 1 1 1 1 5 5 9 9 6 6 1 1 1 5 4 9 4 3 4 1 3 3 1 22 22 22 22 22 22 22 22 22 4 4 4 18 18 18 18 18 18 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. * Copyright (C) 2014 Marvell International Ltd. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/nfc.h> #include "nfc.h" #include "llcp.h" static u8 llcp_magic[3] = {0x46, 0x66, 0x6d}; static LIST_HEAD(llcp_devices); /* Protects llcp_devices list */ static DEFINE_SPINLOCK(llcp_devices_lock); static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb); void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_add_node(sk, &l->head); write_unlock(&l->lock); } void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_del_node_init(sk); write_unlock(&l->lock); } void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock) { sock->remote_rw = LLCP_DEFAULT_RW; sock->remote_miu = LLCP_MAX_MIU + 1; } static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local = sock->local; struct sk_buff *s, *tmp; skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); if (local == NULL) return; /* Search for local pending SKBs that are related to this socket */ skb_queue_walk_safe(&local->tx_queue, s, tmp) { if (s->sk != &sock->sk) continue; skb_unlink(s, &local->tx_queue); kfree_skb(s); } } static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device, int err) { struct sock *sk; struct hlist_node *tmp; struct nfc_llcp_sock *llcp_sock; skb_queue_purge(&local->tx_queue); write_lock(&local->sockets.lock); sk_for_each_safe(sk, tmp, &local->sockets.head) { llcp_sock = nfc_llcp_sock(sk); bh_lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (sk->sk_state == LLCP_CONNECTED) nfc_put_device(llcp_sock->dev); if (sk->sk_state == LLCP_LISTEN) { struct nfc_llcp_sock *lsk, *n; struct sock *accept_sk; list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue, accept_queue) { accept_sk = &lsk->sk; bh_lock_sock(accept_sk); nfc_llcp_accept_unlink(accept_sk); if (err) accept_sk->sk_err = err; accept_sk->sk_state = LLCP_CLOSED; accept_sk->sk_state_change(sk); bh_unlock_sock(accept_sk); } } if (err) sk->sk_err = err; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); bh_unlock_sock(sk); sk_del_node_init(sk); } write_unlock(&local->sockets.lock); /* If we still have a device, we keep the RAW sockets alive */ if (device == true) return; write_lock(&local->raw_sockets.lock); sk_for_each_safe(sk, tmp, &local->raw_sockets.head) { llcp_sock = nfc_llcp_sock(sk); bh_lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (err) sk->sk_err = err; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); bh_unlock_sock(sk); sk_del_node_init(sk); } write_unlock(&local->raw_sockets.lock); } static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) { /* Since using nfc_llcp_local may result in usage of nfc_dev, whenever * we hold a reference to local, we also need to hold a reference to * the device to avoid UAF. */ if (!nfc_get_device(local->dev->idx)) return NULL; kref_get(&local->ref); return local; } static void local_cleanup(struct nfc_llcp_local *local) { nfc_llcp_socket_release(local, false, ENXIO); timer_delete_sync(&local->link_timer); skb_queue_purge(&local->tx_queue); cancel_work_sync(&local->tx_work); cancel_work_sync(&local->rx_work); cancel_work_sync(&local->timeout_work); kfree_skb(local->rx_pending); local->rx_pending = NULL; timer_delete_sync(&local->sdreq_timer); cancel_work_sync(&local->sdreq_timeout_work); nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs); } static void local_release(struct kref *ref) { struct nfc_llcp_local *local; local = container_of(ref, struct nfc_llcp_local, ref); local_cleanup(local); kfree(local); } int nfc_llcp_local_put(struct nfc_llcp_local *local) { struct nfc_dev *dev; int ret; if (local == NULL) return 0; dev = local->dev; ret = kref_put(&local->ref, local_release); nfc_put_device(dev); return ret; } static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, u8 ssap, u8 dsap) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("ssap dsap %d %d\n", ssap, dsap); if (ssap == 0 && dsap == 0) return NULL; read_lock(&local->sockets.lock); llcp_sock = NULL; sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); if (tmp_sock->ssap == ssap && tmp_sock->dsap == dsap) { llcp_sock = tmp_sock; sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); return llcp_sock; } static void nfc_llcp_sock_put(struct nfc_llcp_sock *sock) { sock_put(&sock->sk); } static void nfc_llcp_timeout_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, timeout_work); nfc_dep_link_down(local->dev); } static void nfc_llcp_symm_timer(struct timer_list *t) { struct nfc_llcp_local *local = timer_container_of(local, t, link_timer); pr_err("SYMM timeout\n"); schedule_work(&local->timeout_work); } static void nfc_llcp_sdreq_timeout_work(struct work_struct *work) { unsigned long time; HLIST_HEAD(nl_sdres_list); struct hlist_node *n; struct nfc_llcp_sdp_tlv *sdp; struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, sdreq_timeout_work); mutex_lock(&local->sdreq_lock); time = jiffies - msecs_to_jiffies(3 * local->remote_lto); hlist_for_each_entry_safe(sdp, n, &local->pending_sdreqs, node) { if (time_after(sdp->time, time)) continue; sdp->sap = LLCP_SDP_UNBOUND; hlist_del(&sdp->node); hlist_add_head(&sdp->node, &nl_sdres_list); } if (!hlist_empty(&local->pending_sdreqs)) mod_timer(&local->sdreq_timer, jiffies + msecs_to_jiffies(3 * local->remote_lto)); mutex_unlock(&local->sdreq_lock); if (!hlist_empty(&nl_sdres_list)) nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); } static void nfc_llcp_sdreq_timer(struct timer_list *t) { struct nfc_llcp_local *local = timer_container_of(local, t, sdreq_timer); schedule_work(&local->sdreq_timeout_work); } struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev) { struct nfc_llcp_local *local; struct nfc_llcp_local *res = NULL; spin_lock(&llcp_devices_lock); list_for_each_entry(local, &llcp_devices, list) if (local->dev == dev) { res = nfc_llcp_local_get(local); break; } spin_unlock(&llcp_devices_lock); return res; } static struct nfc_llcp_local *nfc_llcp_remove_local(struct nfc_dev *dev) { struct nfc_llcp_local *local, *tmp; spin_lock(&llcp_devices_lock); list_for_each_entry_safe(local, tmp, &llcp_devices, list) if (local->dev == dev) { list_del(&local->list); spin_unlock(&llcp_devices_lock); return local; } spin_unlock(&llcp_devices_lock); pr_warn("Shutting down device not found\n"); return NULL; } static char *wks[] = { NULL, NULL, /* SDP */ "urn:nfc:sn:ip", "urn:nfc:sn:obex", "urn:nfc:sn:snep", }; static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) { int sap, num_wks; pr_debug("%s\n", service_name); if (service_name == NULL) return -EINVAL; num_wks = ARRAY_SIZE(wks); for (sap = 0; sap < num_wks; sap++) { if (wks[sap] == NULL) continue; if (strncmp(wks[sap], service_name, service_name_len) == 0) return sap; } return -EINVAL; } static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, const u8 *sn, size_t sn_len, bool needref) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("sn %zd %p\n", sn_len, sn); if (sn == NULL || sn_len == 0) return NULL; read_lock(&local->sockets.lock); llcp_sock = NULL; sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); pr_debug("llcp sock %p\n", tmp_sock); if (tmp_sock->sk.sk_type == SOCK_STREAM && tmp_sock->sk.sk_state != LLCP_LISTEN) continue; if (tmp_sock->sk.sk_type == SOCK_DGRAM && tmp_sock->sk.sk_state != LLCP_BOUND) continue; if (tmp_sock->service_name == NULL || tmp_sock->service_name_len == 0) continue; if (tmp_sock->service_name_len != sn_len) continue; if (memcmp(sn, tmp_sock->service_name, sn_len) == 0) { llcp_sock = tmp_sock; if (needref) sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); pr_debug("Found llcp sock %p\n", llcp_sock); return llcp_sock; } u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local, struct nfc_llcp_sock *sock) { mutex_lock(&local->sdp_lock); if (sock->service_name != NULL && sock->service_name_len > 0) { int ssap = nfc_llcp_wks_sap(sock->service_name, sock->service_name_len); if (ssap > 0) { pr_debug("WKS %d\n", ssap); /* This is a WKS, let's check if it's free */ if (test_bit(ssap, &local->local_wks)) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } set_bit(ssap, &local->local_wks); mutex_unlock(&local->sdp_lock); return ssap; } /* * Check if there already is a non WKS socket bound * to this service name. */ if (nfc_llcp_sock_from_sn(local, sock->service_name, sock->service_name_len, false) != NULL) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } mutex_unlock(&local->sdp_lock); return LLCP_SDP_UNBOUND; } else if (sock->ssap != 0 && sock->ssap < LLCP_WKS_NUM_SAP) { if (!test_bit(sock->ssap, &local->local_wks)) { set_bit(sock->ssap, &local->local_wks); mutex_unlock(&local->sdp_lock); return sock->ssap; } } mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } u8 nfc_llcp_get_local_ssap(struct nfc_llcp_local *local) { u8 local_ssap; mutex_lock(&local->sdp_lock); local_ssap = find_first_zero_bit(&local->local_sap, LLCP_LOCAL_NUM_SAP); if (local_ssap == LLCP_LOCAL_NUM_SAP) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } set_bit(local_ssap, &local->local_sap); mutex_unlock(&local->sdp_lock); return local_ssap + LLCP_LOCAL_SAP_OFFSET; } void nfc_llcp_put_ssap(struct nfc_llcp_local *local, u8 ssap) { u8 local_ssap; unsigned long *sdp; if (ssap < LLCP_WKS_NUM_SAP) { local_ssap = ssap; sdp = &local->local_wks; } else if (ssap < LLCP_LOCAL_NUM_SAP) { atomic_t *client_cnt; local_ssap = ssap - LLCP_WKS_NUM_SAP; sdp = &local->local_sdp; client_cnt = &local->local_sdp_cnt[local_ssap]; pr_debug("%d clients\n", atomic_read(client_cnt)); mutex_lock(&local->sdp_lock); if (atomic_dec_and_test(client_cnt)) { struct nfc_llcp_sock *l_sock; pr_debug("No more clients for SAP %d\n", ssap); clear_bit(local_ssap, sdp); /* Find the listening sock and set it back to UNBOUND */ l_sock = nfc_llcp_sock_get(local, ssap, LLCP_SAP_SDP); if (l_sock) { l_sock->ssap = LLCP_SDP_UNBOUND; nfc_llcp_sock_put(l_sock); } } mutex_unlock(&local->sdp_lock); return; } else if (ssap < LLCP_MAX_SAP) { local_ssap = ssap - LLCP_LOCAL_NUM_SAP; sdp = &local->local_sap; } else { return; } mutex_lock(&local->sdp_lock); clear_bit(local_ssap, sdp); mutex_unlock(&local->sdp_lock); } static u8 nfc_llcp_reserve_sdp_ssap(struct nfc_llcp_local *local) { u8 ssap; mutex_lock(&local->sdp_lock); ssap = find_first_zero_bit(&local->local_sdp, LLCP_SDP_NUM_SAP); if (ssap == LLCP_SDP_NUM_SAP) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } pr_debug("SDP ssap %d\n", LLCP_WKS_NUM_SAP + ssap); set_bit(ssap, &local->local_sdp); mutex_unlock(&local->sdp_lock); return LLCP_WKS_NUM_SAP + ssap; } static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { u8 *gb_cur, version, version_length; u8 lto_length, wks_length, miux_length; const u8 *version_tlv = NULL, *lto_tlv = NULL, *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; int ret = 0; version = LLCP_VERSION_11; version_tlv = nfc_llcp_build_tlv(LLCP_TLV_VERSION, &version, 1, &version_length); if (!version_tlv) { ret = -ENOMEM; goto out; } gb_len += version_length; lto_tlv = nfc_llcp_build_tlv(LLCP_TLV_LTO, &local->lto, 1, &lto_length); if (!lto_tlv) { ret = -ENOMEM; goto out; } gb_len += lto_length; pr_debug("Local wks 0x%lx\n", local->local_wks); wks_tlv = nfc_llcp_build_tlv(LLCP_TLV_WKS, (u8 *)&wks, 2, &wks_length); if (!wks_tlv) { ret = -ENOMEM; goto out; } gb_len += wks_length; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0, &miux_length); if (!miux_tlv) { ret = -ENOMEM; goto out; } gb_len += miux_length; gb_len += ARRAY_SIZE(llcp_magic); if (gb_len > NFC_MAX_GT_LEN) { ret = -EINVAL; goto out; } gb_cur = local->gb; memcpy(gb_cur, llcp_magic, ARRAY_SIZE(llcp_magic)); gb_cur += ARRAY_SIZE(llcp_magic); memcpy(gb_cur, version_tlv, version_length); gb_cur += version_length; memcpy(gb_cur, lto_tlv, lto_length); gb_cur += lto_length; memcpy(gb_cur, wks_tlv, wks_length); gb_cur += wks_length; memcpy(gb_cur, miux_tlv, miux_length); gb_cur += miux_length; local->gb_len = gb_len; out: kfree(version_tlv); kfree(lto_tlv); kfree(wks_tlv); kfree(miux_tlv); return ret; } u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) { struct nfc_llcp_local *local; local = nfc_llcp_find_local(dev); if (local == NULL) { *general_bytes_len = 0; return NULL; } nfc_llcp_build_gb(local); *general_bytes_len = local->gb_len; nfc_llcp_local_put(local); return local->gb; } int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { struct nfc_llcp_local *local; int err; if (gb_len < 3 || gb_len > NFC_MAX_GT_LEN) return -EINVAL; local = nfc_llcp_find_local(dev); if (local == NULL) { pr_err("No LLCP device\n"); return -ENODEV; } memset(local->remote_gb, 0, NFC_MAX_GT_LEN); memcpy(local->remote_gb, gb, gb_len); local->remote_gb_len = gb_len; if (memcmp(local->remote_gb, llcp_magic, 3)) { pr_err("MAC does not support LLCP\n"); err = -EINVAL; goto out; } err = nfc_llcp_parse_gb_tlv(local, &local->remote_gb[3], local->remote_gb_len - 3); out: nfc_llcp_local_put(local); return err; } static u8 nfc_llcp_dsap(const struct sk_buff *pdu) { return (pdu->data[0] & 0xfc) >> 2; } static u8 nfc_llcp_ptype(const struct sk_buff *pdu) { return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6); } static u8 nfc_llcp_ssap(const struct sk_buff *pdu) { return pdu->data[1] & 0x3f; } static u8 nfc_llcp_ns(const struct sk_buff *pdu) { return pdu->data[2] >> 4; } static u8 nfc_llcp_nr(const struct sk_buff *pdu) { return pdu->data[2] & 0xf; } static void nfc_llcp_set_nrns(struct nfc_llcp_sock *sock, struct sk_buff *pdu) { pdu->data[2] = (sock->send_n << 4) | (sock->recv_n); sock->send_n = (sock->send_n + 1) % 16; sock->recv_ack_n = (sock->recv_n - 1) % 16; } void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction) { struct sk_buff *skb_copy = NULL, *nskb; struct sock *sk; u8 *data; read_lock(&local->raw_sockets.lock); sk_for_each(sk, &local->raw_sockets.head) { if (sk->sk_state != LLCP_BOUND) continue; if (skb_copy == NULL) { skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, GFP_ATOMIC, true); if (skb_copy == NULL) continue; data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE); data[0] = local->dev ? local->dev->idx : 0xFF; data[1] = direction & 0x01; data[1] |= (RAW_PAYLOAD_LLCP << 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&local->raw_sockets.lock); kfree_skb(skb_copy); } static void nfc_llcp_tx_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, tx_work); struct sk_buff *skb; struct sock *sk; struct nfc_llcp_sock *llcp_sock; skb = skb_dequeue(&local->tx_queue); if (skb != NULL) { sk = skb->sk; llcp_sock = nfc_llcp_sock(sk); if (llcp_sock == NULL && nfc_llcp_ptype(skb) == LLCP_PDU_I) { kfree_skb(skb); nfc_llcp_send_symm(local->dev); } else if (llcp_sock && !llcp_sock->remote_ready) { skb_queue_head(&local->tx_queue, skb); nfc_llcp_send_symm(local->dev); } else { struct sk_buff *copy_skb = NULL; u8 ptype = nfc_llcp_ptype(skb); int ret; pr_debug("Sending pending skb\n"); print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, skb->len, true); if (ptype == LLCP_PDU_I) copy_skb = skb_copy(skb, GFP_ATOMIC); __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX); ret = nfc_data_exchange(local->dev, local->target_idx, skb, nfc_llcp_recv, local); if (ret) { kfree_skb(copy_skb); goto out; } if (ptype == LLCP_PDU_I && copy_skb) skb_queue_tail(&llcp_sock->tx_pending_queue, copy_skb); } } else { nfc_llcp_send_symm(local->dev); } out: mod_timer(&local->link_timer, jiffies + msecs_to_jiffies(2 * local->remote_lto)); } static struct nfc_llcp_sock *nfc_llcp_connecting_sock_get(struct nfc_llcp_local *local, u8 ssap) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; read_lock(&local->connecting_sockets.lock); sk_for_each(sk, &local->connecting_sockets.head) { llcp_sock = nfc_llcp_sock(sk); if (llcp_sock->ssap == ssap) { sock_hold(&llcp_sock->sk); goto out; } } llcp_sock = NULL; out: read_unlock(&local->connecting_sockets.lock); return llcp_sock; } static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, const u8 *sn, size_t sn_len) { return nfc_llcp_sock_from_sn(local, sn, sn_len, true); } static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) { u8 type, length; const u8 *tlv = &skb->data[2]; size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0; while (offset < tlv_array_len) { type = tlv[0]; length = tlv[1]; pr_debug("type 0x%x length %d\n", type, length); if (type == LLCP_TLV_SN) { *sn_len = length; return &tlv[2]; } offset += length + 2; tlv += length + 2; } return NULL; } static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct nfc_llcp_ui_cb *ui_cb; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); ui_cb = nfc_llcp_ui_skb_cb(skb); ui_cb->dsap = dsap; ui_cb->ssap = ssap; pr_debug("%d %d\n", dsap, ssap); /* We're looking for a bound socket, not a client one */ llcp_sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP); if (llcp_sock == NULL || llcp_sock->sk.sk_type != SOCK_DGRAM) return; /* There is no sequence with UI frames */ skb_pull(skb, LLCP_HEADER_SIZE); if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { /* * UI frames will be freed from the socket layer, so we * need to keep them alive until someone receives them. */ skb_get(skb); } else { pr_err("Receive queue is full\n"); } nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct sock *new_sk, *parent; struct nfc_llcp_sock *sock, *new_sock; u8 dsap, ssap, reason; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("%d %d\n", dsap, ssap); if (dsap != LLCP_SAP_SDP) { sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP); if (sock == NULL || sock->sk.sk_state != LLCP_LISTEN) { reason = LLCP_DM_NOBOUND; goto fail; } } else { const u8 *sn; size_t sn_len; sn = nfc_llcp_connect_sn(skb, &sn_len); if (sn == NULL) { reason = LLCP_DM_NOBOUND; goto fail; } pr_debug("Service name length %zu\n", sn_len); sock = nfc_llcp_sock_get_sn(local, sn, sn_len); if (sock == NULL) { reason = LLCP_DM_NOBOUND; goto fail; } } lock_sock(&sock->sk); parent = &sock->sk; if (sk_acceptq_is_full(parent)) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } if (sock->ssap == LLCP_SDP_UNBOUND) { u8 ssap = nfc_llcp_reserve_sdp_ssap(local); pr_debug("First client, reserving %d\n", ssap); if (ssap == LLCP_SAP_MAX) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } sock->ssap = ssap; } new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } new_sock = nfc_llcp_sock(new_sk); new_sock->local = nfc_llcp_local_get(local); if (!new_sock->local) { reason = LLCP_DM_REJ; sock_put(&new_sock->sk); release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } new_sock->dev = local->dev; new_sock->rw = sock->rw; new_sock->miux = sock->miux; new_sock->nfc_protocol = sock->nfc_protocol; new_sock->dsap = ssap; new_sock->target_idx = local->target_idx; new_sock->parent = parent; new_sock->ssap = sock->ssap; if (sock->ssap < LLCP_LOCAL_NUM_SAP && sock->ssap >= LLCP_WKS_NUM_SAP) { atomic_t *client_count; pr_debug("reserved_ssap %d for %p\n", sock->ssap, new_sock); client_count = &local->local_sdp_cnt[sock->ssap - LLCP_WKS_NUM_SAP]; atomic_inc(client_count); new_sock->reserved_ssap = sock->ssap; } nfc_llcp_parse_connection_tlv(new_sock, &skb->data[LLCP_HEADER_SIZE], skb->len - LLCP_HEADER_SIZE); pr_debug("new sock %p sk %p\n", new_sock, &new_sock->sk); nfc_llcp_sock_link(&local->sockets, new_sk); nfc_llcp_accept_enqueue(&sock->sk, new_sk); nfc_get_device(local->dev->idx); new_sk->sk_state = LLCP_CONNECTED; /* Wake the listening processes */ parent->sk_data_ready(parent); /* Send CC */ nfc_llcp_send_cc(new_sock); release_sock(&sock->sk); sock_put(&sock->sk); return; fail: /* Send DM */ nfc_llcp_send_dm(local, dsap, ssap, reason); } int nfc_llcp_queue_i_frames(struct nfc_llcp_sock *sock) { int nr_frames = 0; struct nfc_llcp_local *local = sock->local; pr_debug("Remote ready %d tx queue len %d remote rw %d", sock->remote_ready, skb_queue_len(&sock->tx_pending_queue), sock->remote_rw); /* Try to queue some I frames for transmission */ while (sock->remote_ready && skb_queue_len(&sock->tx_pending_queue) < sock->remote_rw) { struct sk_buff *pdu; pdu = skb_dequeue(&sock->tx_queue); if (pdu == NULL) break; /* Update N(S)/N(R) */ nfc_llcp_set_nrns(sock, pdu); skb_queue_tail(&local->tx_queue, pdu); nr_frames++; } return nr_frames; } static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap, ptype, ns, nr; ptype = nfc_llcp_ptype(skb); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); ns = nfc_llcp_ns(skb); nr = nfc_llcp_nr(skb); pr_debug("%d %d R %d S %d\n", dsap, ssap, nr, ns); llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); if (llcp_sock == NULL) { nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; lock_sock(sk); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_sock_put(llcp_sock); } /* Pass the payload upstream */ if (ptype == LLCP_PDU_I) { pr_debug("I frame, queueing on %p\n", &llcp_sock->sk); if (ns == llcp_sock->recv_n) llcp_sock->recv_n = (llcp_sock->recv_n + 1) % 16; else pr_err("Received out of sequence I PDU\n"); skb_pull(skb, LLCP_HEADER_SIZE + LLCP_SEQUENCE_SIZE); if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { /* * I frames will be freed from the socket layer, so we * need to keep them alive until someone receives them. */ skb_get(skb); } else { pr_err("Receive queue is full\n"); } } /* Remove skbs from the pending queue */ if (llcp_sock->send_ack_n != nr) { struct sk_buff *s, *tmp; u8 n; llcp_sock->send_ack_n = nr; /* Remove and free all skbs until ns == nr */ skb_queue_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) { n = nfc_llcp_ns(s); skb_unlink(s, &llcp_sock->tx_pending_queue); kfree_skb(s); if (n == nr) break; } /* Re-queue the remaining skbs for transmission */ skb_queue_reverse_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) { skb_unlink(s, &llcp_sock->tx_pending_queue); skb_queue_head(&local->tx_queue, s); } } if (ptype == LLCP_PDU_RR) llcp_sock->remote_ready = true; else if (ptype == LLCP_PDU_RNR) llcp_sock->remote_ready = false; if (nfc_llcp_queue_i_frames(llcp_sock) == 0 && ptype == LLCP_PDU_I) nfc_llcp_send_rr(llcp_sock); release_sock(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); if ((dsap == 0) && (ssap == 0)) { pr_debug("Connection termination"); nfc_dep_link_down(local->dev); return; } llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); if (llcp_sock == NULL) { nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_sock_put(llcp_sock); } if (sk->sk_state == LLCP_CONNECTED) { nfc_put_device(local->dev); sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); } nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_DISC); release_sock(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); llcp_sock = nfc_llcp_connecting_sock_get(local, dsap); if (llcp_sock == NULL) { pr_err("Invalid CC\n"); nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; /* Unlink from connecting and link to the client array */ nfc_llcp_sock_unlink(&local->connecting_sockets, sk); nfc_llcp_sock_link(&local->sockets, sk); llcp_sock->dsap = ssap; nfc_llcp_parse_connection_tlv(llcp_sock, &skb->data[LLCP_HEADER_SIZE], skb->len - LLCP_HEADER_SIZE); sk->sk_state = LLCP_CONNECTED; sk->sk_state_change(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap, reason; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); reason = skb->data[2]; pr_debug("%d %d reason %d\n", ssap, dsap, reason); switch (reason) { case LLCP_DM_NOBOUND: case LLCP_DM_REJ: llcp_sock = nfc_llcp_connecting_sock_get(local, dsap); break; default: llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); break; } if (llcp_sock == NULL) { pr_debug("Already closed\n"); return; } sk = &llcp_sock->sk; sk->sk_err = ENXIO; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; u8 dsap, ssap, type, length, tid, sap; const u8 *tlv; u16 tlv_len, offset; const char *service_name; size_t service_name_len; struct nfc_llcp_sdp_tlv *sdp; HLIST_HEAD(llc_sdres_list); size_t sdres_tlvs_len; HLIST_HEAD(nl_sdres_list); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("%d %d\n", dsap, ssap); if (dsap != LLCP_SAP_SDP || ssap != LLCP_SAP_SDP) { pr_err("Wrong SNL SAP\n"); return; } tlv = &skb->data[LLCP_HEADER_SIZE]; tlv_len = skb->len - LLCP_HEADER_SIZE; offset = 0; sdres_tlvs_len = 0; while (offset < tlv_len) { type = tlv[0]; length = tlv[1]; switch (type) { case LLCP_TLV_SDREQ: tid = tlv[2]; service_name = (char *) &tlv[3]; service_name_len