Total coverage: 102204 (6%)of 1876304
4 1 4 1 1 1 40 40 40 37 37 37 170 174 17 17 17 17 17 37 20 17 17 17 37 17 20 36 37 37 36 36 37 36 37 34 1 2 40 46 47 55 34 34 34 34 34 34 5 34 34 30 30 12 12 8 8 18 18 54 55 55 55 55 55 55 55 46 40 6 41 5 4 4 4 2 112 1 2 111 110 22 86 28 47 15 2 4 15 1 2 2 3 2 5 4 40 41 3 2 36 41 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * AF_SMC protocol family socket handler keeping the AF_INET sock address type * applies to SOCK_STREAM sockets only * offers an alternative communication option for TCP-protocol sockets * applicable with RoCE-cards only * * Initial restrictions: * - support for alternate links postponed * * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> * based on prototype from Frank Blaschka */ #define pr_fmt(fmt) "smc: " fmt #include <linux/module.h> #include <linux/socket.h> #include <linux/workqueue.h> #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> #include <linux/rcupdate_wait.h> #include <linux/ctype.h> #include <linux/splice.h> #include <net/sock.h> #include <net/inet_common.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #endif #include <net/tcp.h> #include <net/smc.h> #include <asm/ioctls.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc.h" #include "smc_clc.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_inet.h" #include "smc_hs_bpf.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server */ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); void *hdr; if (cb_ctx->pos[0]) goto out; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_DUMP_HS_LIMITATION); if (!hdr) return -ENOMEM; if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, sock_net(skb->sk)->smc.limit_smc_hs)) goto err; genlmsg_end(skb, hdr); cb_ctx->pos[0] = 1; out: return skb->len; err: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) { sock_net(skb->sk)->smc.limit_smc_hs = true; return 0; } int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) { sock_net(skb->sk)->smc.limit_smc_hs = false; return 0; } static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct smc_sock *smc; struct sock *child; smc = smc_clcsock_user_data(sk); if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > sk->sk_max_ack_backlog) goto drop; if (sk_acceptq_is_full(&smc->sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } /* passthrough to original syn recv sock fct */ child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); /* child must not inherit smc or its ops */ if (child) { rcu_assign_sk_user_data(child, NULL); /* v4-mapped sockets don't inherit parent ops. Don't restore. */ if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops) inet_csk(child)->icsk_af_ops = smc->ori_af_ops; } return child; drop: dst_release(dst); tcp_listendrop(sk); return NULL; } static bool smc_hs_congested(const struct sock *sk) { const struct smc_sock *smc; smc = smc_clcsock_user_data(sk); if (!smc) return true; if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) return true; return false; } struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; head = &h->ht; write_lock_bh(&h->lock); sk_add_node(sk, head); write_unlock_bh(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } void smc_unhash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; write_lock_bh(&h->lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the * BH context */ void smc_release_cb(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); if (smc->conn.tx_in_release_sock) { smc_tx_pending(&smc->conn); smc->conn.tx_in_release_sock = false; } } struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); struct proto smc_proto6 = { .name = "SMC6", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto6); static void smc_fback_restore_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = NULL; smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); write_unlock_bh(&clcsk->sk_callback_lock); } static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; smc_fback_restore_callbacks(smc); } } static int __smc_release(struct smc_sock *smc) { struct sock *sk = &smc->sk; int rc = 0; if (!smc->use_fallback) { rc = smc_close_active(smc); smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { if (sk->sk_state != SMC_CLOSED) { if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) sock_put(sk); /* passive closing */ if (sk->sk_state == SMC_LISTEN) { /* wake up clcsock accept */ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); } smc_restore_fallback_changes(smc); } sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { if (smc->clcsock) { release_sock(sk); smc_clcsock_release(smc); lock_sock(sk); } if (!smc->use_fallback) smc_conn_free(&smc->conn); } return rc; } int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; int old_state, rc = 0; if (!sk) goto out; sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); old_state = sk->sk_state; /* cleanup for a dangling non-blocking connect */ if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again */ lock_sock_nested(sk, SINGLE_DEPTH_NESTING); else lock_sock(sk); if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && !smc->use_fallback) smc_close_active_abort(smc); rc = __smc_release(smc); /* detach socket */ sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; } static void smc_destruct(struct sock *sk) { if (sk->sk_state != SMC_CLOSED) return; if (!sock_flag(sk, SOCK_DEAD)) return; switch (sk->sk_family) { case AF_INET: inet_sock_destruct(sk); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: inet6_sock_destruct(sk); break; #endif } } static struct lock_class_key smc_key; static struct lock_class_key smc_slock_key; void smc_sk_init(struct net *net, struct sock *sk, int protocol) { struct smc_sock *smc = smc_sk(sk); sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); sock_lock_init_class_and_name(sk, "slock-AF_SMC", &smc_slock_key, "sk_lock-AF_SMC", &smc_key); spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); smc->limit_smc_hs = net->smc.limit_smc_hs; smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; smc_close_init(smc); } static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, int protocol) { struct proto *prot; struct sock *sk; prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); if (!sk) return NULL; sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ smc_sk_init(net, sk, protocol); return sk; } int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); /* replicate tests from inet_bind(), to be safe wrt. future changes */ rc = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) goto out; rc = -EAFNOSUPPORT; if (addr->sin_family != AF_INET && addr->sin_family != AF_INET6 && addr->sin_family != AF_UNSPEC) goto out; /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ if (addr->sin_family == AF_UNSPEC && addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; lock_sock(sk); /* Check if socket is already active */ rc = -EINVAL; if (sk->sk_state != SMC_INIT || smc->connect_nonblock) goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; rc = kernel_bind(smc->clcsock, uaddr, addr_len); out_rel: release_sock(sk); out: return rc; } /* copy only relevant settings and flags of SOL_SOCKET level from smc to * clc socket (since smc is not called for these options from net/core) */ #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ (1UL << SOCK_LINGER) | \ (1UL << SOCK_BROADCAST) | \ (1UL << SOCK_TIMESTAMP) | \ (1UL << SOCK_DBG) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVTSTAMPNS) | \ (1UL << SOCK_LOCALROUTE) | \ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ (1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_WIFI_STATUS) | \ (1UL << SOCK_NOFCS) | \ (1UL << SOCK_FILTER_LOCKED) | \ (1UL << SOCK_TSTAMP_NEW)) /* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, unsigned long mask) { nsk->sk_userlocks = osk->sk_userlocks; if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) nsk->sk_sndbuf = osk->sk_sndbuf; if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) nsk->sk_rcvbuf = osk->sk_rcvbuf; } static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, unsigned long mask) { /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; nsk->sk_sndtimeo = READ_ONCE(osk->sk_sndtimeo); nsk->sk_rcvtimeo = READ_ONCE(osk->sk_rcvtimeo); nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_priority = READ_ONCE(osk->sk_priority); nsk->sk_rcvlowat = osk->sk_rcvlowat; nsk->sk_bound_dev_if = osk->sk_bound_dev_if; nsk->sk_err = osk->sk_err; nsk->sk_flags &= ~mask; nsk->sk_flags |= osk->sk_flags & mask; smc_adjust_sock_bufsizes(nsk, osk, mask); } static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); } #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ (1UL << SOCK_LINGER) | \ (1UL << SOCK_DBG)) /* copy only settings and flags relevant for smc from clc to smc socket */ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) { smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } /* register the new vzalloced sndbuf on all links */ static int smcr_lgr_reg_sndbufs(struct smc_link *link, struct smc_buf_desc *snd_desc) { struct smc_link_group *lgr = link->lgr; int i, rc = 0; if (!snd_desc->is_vm) return -EINVAL; /* protect against parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); if (rc) break; } up_write(&lgr->llc_conf_mutex); return rc; } /* register the new rmb on all links */ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; bool do_slow = false; int i, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; if (!rmb_desc->is_reg_mr[link->link_idx]) { up_read(&lgr->llc_conf_mutex); goto slow_path; } } /* mr register already */ goto fast_path; slow_path: do_slow = true; /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; } fast_path: /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { rc = -EFAULT; goto out; } rmb_desc->is_conf_rkey = true; out: do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; struct smc_llc_qentry *qentry; int rc; /* Receive CONFIRM LINK request from server over RoCE fabric. * Increasing the client's timeout by twice as much as the server's * timeout by default can temporarily avoid decline messages of * both sides crossing or colliding */ qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); if (rc) return SMC_CLC_DECL_ERR_RDYLNK; smc_wr_remember_qp_attr(link); /* reg the sndbuf if it was vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } /* reg the rmb */ if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); if (link->lgr->max_links > 1) { /* optional 2nd link, receive ADD LINK request from server */ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); if (rc == -EAGAIN) rc = 0; /* no DECLINE received, go with one link */ return rc; } smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); smc_llc_cli_add_link(link, qentry); } return 0; } static bool smc_isascii(char *hostname) { int i; for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) if (!isascii(hostname[i])) return false; return true; } static void smc_conn_save_peer_info_fce(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { struct smc_clc_first_contact_ext *fce; int clc_v2_len; if (clc->hdr.version == SMC_V1 || !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) return; if (smc->conn.lgr->is_smcd) { memcpy(smc->conn.lgr->negotiated_eid, clc->d1.eid, SMC_MAX_EID_LEN); clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1); } else { memcpy(smc->conn.lgr->negotiated_eid, clc->r1.eid, SMC_MAX_EID_LEN); clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1); } fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); smc->conn.lgr->peer_os = fce->os_type; smc->conn.lgr->peer_smc_release = fce->release; if (smc_isascii(fce->hostname)) memcpy(smc->conn.lgr->peer_hostname, fce->hostname, SMC_MAX_HOSTNAME_LEN); } static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; smc->conn.peer_token = ntohll(clc->d0.token); /* msg header takes up space in the buffer */ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; } static void smc_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { if (smc->conn.lgr->is_smcd) smcd_conn_save_peer_info(smc, clc); else smcr_conn_save_peer_info(smc, clc); smc_conn_save_peer_info_fce(smc, clc); } static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc, struct smc_init_info *ini) { link->peer_qpn = ntoh24(clc->r0.qpn); memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, struct smc_stats_fback *fback_arr) { int cnt; for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { if (fback_arr[cnt].fback_code == smc->fallback_rsn) { fback_arr[cnt].count++; break; } if (!fback_arr[cnt].fback_code) { fback_arr[cnt].fback_code = smc->fallback_rsn; fback_arr[cnt].count++; break; } } } static void smc_stat_fallback(struct smc_sock *smc) { struct net *net = sock_net(&smc->sk); mutex_lock(&net->smc.mutex_fback_rsn); if (smc->listen_smc) { smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); net->smc.fback_rsn->srv_fback_cnt++; } else { smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); net->smc.fback_rsn->clnt_fback_cnt++; } mutex_unlock(&net->smc.mutex_fback_rsn); } /* must be called under rcu read lock */ static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) { struct socket_wq *wq; __poll_t flags; wq = rcu_dereference(smc->sk.sk_wq); if (!skwq_has_sleeper(wq)) return; /* wake up smc sk->sk_wq */ if (!key) { /* sk_state_change */ wake_up_interruptible_all(&wq->wait); } else { flags = key_to_poll(key); if (flags & (EPOLLIN | EPOLLOUT)) /* sk_data_ready or sk_write_space */ wake_up_interruptible_sync_poll(&wq->wait, flags); else if (flags & EPOLLERR) /* sk_error_report */ wake_up_interruptible_poll(&wq->wait, flags); } } static int smc_fback_mark_woken(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct smc_mark_woken *mark = container_of(wait, struct smc_mark_woken, wait_entry); mark->woken = true; mark->key = key; return 0; } static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, void (*clcsock_callback)(struct sock *sk)) { struct smc_mark_woken mark = { .woken = false }; struct socket_wq *wq; init_waitqueue_func_entry(&mark.wait_entry, smc_fback_mark_woken); rcu_read_lock(); wq = rcu_dereference(clcsk->sk_wq); if (!wq) goto out; add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); clcsock_callback(clcsk); remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); if (mark.woken) smc_fback_wakeup_waitqueue(smc, mark.key); out: rcu_read_unlock(); } static void smc_fback_state_change(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_data_ready(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_write_space(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_error_report(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_replace_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, &smc->clcsk_state_change); smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, &smc->clcsk_data_ready); smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, &smc->clcsk_write_space); smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, &smc->clcsk_error_report); write_unlock_bh(&clcsk->sk_callback_lock); } static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { int rc = 0; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { rc = -EBADF; goto out; } smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); trace_smc_switch_to_fallback(smc, reason_code); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; smc->sk.sk_socket->wq.fasync_list = NULL; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up * as clcsock's wait queue is woken up. */ smc_fback_replace_callbacks(smc); } out: mutex_unlock(&smc->clcsock_release_lock); return rc; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { struct net *net = sock_net(&smc->sk); int rc = 0; rc = smc_switch_to_fallback(smc, reason_code); if (rc) { /* fallback fails */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; } /* decline and fall back during connect */ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, u8 version) { struct net *net = sock_net(&smc->sk); int rc; if (reason_code < 0) { /* error, fallback is not possible */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; } if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } } return smc_connect_fallback(smc, reason_code); } static void smc_conn_abort(struct smc_sock *smc, int local_first) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; bool lgr_valid = false; if (smc_conn_lgr_valid(conn)) lgr_valid = true; smc_conn_free(conn); if (local_first && lgr_valid) smc_lgr_cleanup_early(lgr); } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ smc_pnet_find_roce_resource(smc->clcsock->sk, ini); if (!ini->check_smcrv2 && !ini->ib_dev) return SMC_CLC_DECL_NOSMCRDEV; if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) return SMC_CLC_DECL_NOSMCRDEV; return 0; } /* check if there is an ISM device available for this connection. */ /* called for connect and listen */ static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ smc_pnet_find_ism_resource(smc->clcsock->sk, ini); if (!ini->ism_dev[0]) return SMC_CLC_DECL_NOSMCDDEV; else ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]); return 0; } /* is chid unique for the ism devices that are already determined? */ static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini, int cnt) { int i = (!ini->ism_dev[0]) ? 1 : 0; for (; i < cnt; i++) if (ini->ism_chid[i] == chid) return false; return true; } /* determine possible V2 ISM devices (either without PNETID or with PNETID plus * PNETID matching net_device) */ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, struct smc_init_info *ini) { int rc = SMC_CLC_DECL_NOSMCDDEV; struct smcd_dev *smcd; int i = 1, entry = 1; bool is_emulated; u16 chid; if (smcd_indicated(ini->smc_type_v1)) rc = 0; /* already initialized for V1 */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->going_away || smcd == ini->ism_dev[0]) continue; chid = smc_ism_get_chid(smcd); if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) continue; is_emulated = __smc_ism_is_emulated(chid); if (!smc_pnet_is_pnetid_set(smcd->pnetid) || smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) /* It's the last GID-CHID entry left in CLC * Proposal SMC-Dv2 extension, but an Emulated- * ISM device will take two entries. So give * up it and try the next potential ISM device. */ continue; ini->ism_dev[i] = smcd; ini->ism_chid[i] = chid; ini->is_smcd = true; rc = 0; i++; entry = is_emulated ? entry + 2 : entry + 1; if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES) break; } } mutex_unlock(&smcd_dev_list.mutex); ini->ism_offered_cnt = i - 1; if (!ini->ism_dev[0] && !ini->ism_dev[1]) ini->smcd_version = 0; return rc; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_init_info *ini) { if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; return 0; } static int smc_find_proposal_devices(struct smc_sock *smc, struct smc_init_info *ini) { int rc = 0; /* check if there is an ism device available */ if (!(ini->smcd_version & SMC_V1) || smc_find_ism_device(smc, ini) || smc_connect_ism_vlan_setup(ini)) ini->smcd_version &= ~SMC_V1; /* else ISM V1 is supported for this connection */ /* check if there is an rdma device available */ if (!(ini->smcr_version & SMC_V1) || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V1; /* else RDMA is supported for this connection */ ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, ini->smcr_version & SMC_V1); /* check if there is an ism v2 device available */ if (!(ini->smcd_version & SMC_V2) || !smc_ism_is_v2_capable() || smc_find_ism_v2_device_clnt(smc, ini)) ini->smcd_version &= ~SMC_V2; /* check if there is an rdma v2 device available */ ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || #if IS_ENABLED(CONFIG_IPV6) (smc->clcsock->sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) || #endif !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; ini->check_smcrv2 = false; ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, ini->smcr_version & SMC_V2); /* if neither ISM nor RDMA are supported, fallback */ if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) rc = SMC_CLC_DECL_NOSMCDEV; return rc; } /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. */ static int smc_connect_ism_vlan_cleanup(struct smc_init_info *ini) { if (!smcd_indicated(ini->smc_type_v1)) return 0; if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } #define SMC_CLC_MAX_ACCEPT_LEN \ (sizeof(struct smc_clc_msg_accept_confirm) + \ sizeof(struct smc_clc_first_contact_ext_v2x) + \ sizeof(struct smc_clc_msg_trail)) /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; /* receive SMC Accept CLC message */ return smc_clc_wait_msg(smc, aclc, SMC_CLC_MAX_ACCEPT_LEN, SMC_CLC_ACCEPT, CLC_WAIT_TIME); } void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid) { struct smc_init_info *alt_ini = NULL; memset(gidlist, 0, sizeof(*gidlist)); memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); if (!alt_ini) goto out; alt_ini->vlan_id = lgr->vlan_id; alt_ini->check_smcrv2 = true; alt_ini->smcrv2.saddr = lgr->saddr; smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); if (!alt_ini->smcrv2.ib_dev_v2) goto out; memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); out: kfree(alt_ini); } static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(aclc, false); struct net *net = sock_net(&smc->sk); int rc; if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) return 0; if (fce->v2_direct) { memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); ini->smcrv2.uses_gateway = false; } else { if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr, smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), ini->smcrv2.nexthop_mac, &ini->smcrv2.uses_gateway)) return SMC_CLC_DECL_NOROUTE; if (!ini->smcrv2.uses_gateway) { /* mismatch: peer claims indirect, but its direct */ return SMC_CLC_DECL_NOINDIRECT; } } ini->release_nr = fce->release; rc = smc_clc_clnt_v2x_features_validate(fce, ini); if (rc) return rc; return 0; } /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i, reason_code = 0; struct smc_link *link; u8 *eid = NULL; ini->is_smcd = false; ini->ib_clcqpn = ntoh24(aclc->r0.qpn); ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); ini->max_conns = SMC_CONN_PER_LGR_MAX; ini->max_links = SMC_LINKS_ADD_LNK_MAX; reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); if (reason_code) return reason_code; mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); if (reason_code) { mutex_unlock(&smc_client_lgr_pending); return reason_code; } smc_conn_save_peer_info(smc, aclc); if (ini->first_contact_local) { link = smc->conn.lnk; } else { /* set link that was assigned by server */ link = NULL; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *l = &smc->conn.lgr->lnk[i]; if (l->peer_qpn == ntoh24(aclc->r0.qpn) && !memcmp(l->peer_gid, &aclc->r0.lcl.gid, SMC_GID_SIZE) && (aclc->hdr.version > SMC_V1 || !memcmp(l->peer_mac, &aclc->r0.lcl.mac, sizeof(l->peer_mac)))) { link = l; break; } } if (!link) { reason_code = SMC_CLC_DECL_NOSRVLINK; goto connect_abort; } smc_switch_link_and_count(&smc->conn, link); } /* create send buffer and rmb */ if (smc_buf_create(smc, false)) { reason_code = SMC_CLC_DECL_MEM; goto connect_abort; } if (ini->first_contact_local) smc_link_save_peer_info(link, aclc, ini); if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto connect_abort; } smc_rx_init(smc); if (ini->first_contact_local) { if (smc_ib_ready_link(link)) { reason_code = SMC_CLC_DECL_ERR_RDYLNK; goto connect_abort; } } else { /* reg sendbufs if they were vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } if (aclc->hdr.version > SMC_V1) { eid = aclc->r1.eid; if (ini->first_contact_local) smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, link->smcibdev, link->gid); } reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, aclc->hdr.version, eid, ini); if (reason_code) goto connect_abort; smc_tx_init(smc); if (ini->first_contact_local) { /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_clnt_conf_first_link(smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto connect_abort; } mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; return reason_code; } /* The server has chosen one of the proposed ISM devices for the communication. * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. */ static int smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } } return -EPROTO; } /* setup for ISM connection of client */ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { u8 *eid = NULL; int rc = 0; ini->is_smcd = true; ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; if (aclc->hdr.version == SMC_V2) { if (ini->first_contact_peer) { struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(aclc, true); ini->release_nr = fce->release; rc = smc_clc_clnt_v2x_features_validate(fce, ini); if (rc) return rc; } rc = smc_v2_determine_accepted_chid(aclc, ini); if (rc) return rc; if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected])) ini->ism_peer_gid[ini->ism_selected].gid_ext = ntohll(aclc->d1.gid_ext); /* for non-Emulated-ISM devices, peer gid_ext remains 0. */ } ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid); /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); rc = smc_conn_create(smc, ini); if (rc) { mutex_unlock(&smc_server_lgr_pending); return rc; } /* Create send and receive buffers */ rc = smc_buf_create(smc, true); if (rc) { rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; goto connect_abort; } smc_conn_save_peer_info(smc, aclc); if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) { rc = smcd_buf_attach(smc); if (rc) { rc = SMC_CLC_DECL_MEM; /* try to fallback */ goto connect_abort; } } smc_rx_init(smc); smc_tx_init(smc); if (aclc->hdr.version > SMC_V1) eid = aclc->d1.eid; rc = smc_clc_send_confirm(smc, ini->first_contact_local, aclc->hdr.version, eid, ini); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_server_lgr_pending); smc->connect_nonblock = 0; return rc; } /* check if received accept type and version matches a proposed one */ static int smc_connect_check_aclc(struct smc_init_info *ini, struct smc_clc_msg_accept_confirm *aclc) { if (aclc->hdr.version >= SMC_V2) { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v2)) || (aclc->hdr.typev1 == SMC_TYPE_D && !smcd_indicated(ini->smc_type_v2))) return SMC_CLC_DECL_MODEUNSUPP; } else { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v1)) || (aclc->hdr.typev1 == SMC_TYPE_D && !smcd_indicated(ini->smc_type_v1))) return SMC_CLC_DECL_MODEUNSUPP; } return 0; } /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; struct smc_clc_msg_accept_confirm *aclc; struct smc_init_info *ini = NULL; u8 *buf = NULL; int rc = 0; if (smc->use_fallback) return smc_connect_fallback(smc, smc->fallback_rsn); /* if peer has not signalled SMC-capability, fall back */ if (!tcp_sk(smc->clcsock->sk)->syn_smc) return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC, version); ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, version); ini->smcd_version = SMC_V1 | SMC_V2; ini->smcr_version = SMC_V1 | SMC_V2; ini->smc_type_v1 = SMC_TYPE_B; ini->smc_type_v2 = SMC_TYPE_B; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { ini->smcd_version &= ~SMC_V1; ini->smcr_version = 0; ini->smc_type_v1 = SMC_TYPE_N; } rc = smc_find_proposal_devices(smc, ini); if (rc) goto fallback; buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL); if (!buf) { rc = SMC_CLC_DECL_MEM; goto fallback; } aclc = (struct smc_clc_msg_accept_confirm *)buf; /* perform CLC handshake */ rc = smc_connect_clc(smc, aclc, ini); if (rc) { /* -EAGAIN on timeout, see tcp_recvmsg() */ if (rc == -EAGAIN) { rc = -ETIMEDOUT; smc->sk.sk_err = ETIMEDOUT; } goto vlan_cleanup; } /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2; if (rc) goto vlan_cleanup; /* depending on previous steps, connect using rdma or ism */ if (aclc->hdr.typev1 == SMC_TYPE_R) { ini->smcr_version = version; rc = smc_connect_rdma(smc, aclc, ini); } else if (aclc->hdr.typev1 == SMC_TYPE_D) { ini->smcd_version = version; rc = smc_connect_ism(smc, aclc, ini); } if (rc) goto vlan_cleanup; SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); smc_connect_ism_vlan_cleanup(ini); kfree(buf); kfree(ini); return 0; vlan_cleanup: smc_connect_ism_vlan_cleanup(ini); kfree(buf); fallback: kfree(ini); return smc_connect_decline_fallback(smc, rc, version); } static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); long timeo = READ_ONCE(smc->sk.sk_sndtimeo); int rc = 0; if (!timeo) timeo = MAX_SCHEDULE_TIMEOUT; lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; } else if ((1 << smc->clcsock->sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); if ((rc == -EPIPE) && ((1 << smc->clcsock->sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) rc = 0; } release_sock(smc->clcsock->sk); lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; else if (rc == -ECONNREFUSED) smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ goto out; } rc = __smc_connect(smc); if (rc < 0) smc->sk.sk_err = -rc; out: if (!sock_flag(&smc->sk, SOCK_DEAD)) { if (smc->sk.sk_err) { smc->sk.sk_state_change(&smc->sk); } else { /* allow polling before and after fallback decision */ smc->clcsock->sk->sk_write_space(smc->clcsock->sk); smc->sk.sk_write_space(&smc->sk); } } release_sock(&smc->sk); } int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -EINVAL; smc = smc_sk(sk); /* separate smc parameter checking to be safe */ if (alen < sizeof(addr->sa_family)) goto out_err; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) goto out_err; lock_sock(sk); switch (sock->state) { default: rc = -EINVAL; goto out; case SS_CONNECTED: rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; goto out; case SS_CONNECTING: if (sk->sk_state == SMC_ACTIVE) goto connected; break; case SS_UNCONNECTED: sock->state = SS_CONNECTING; break; } switch (sk->sk_state) { default: goto out; case SMC_CLOSED: rc = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; case SMC_INIT: break; } smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { rc = -EALREADY; goto out; } rc = kernel_connect(smc->clcsock, (struct sockaddr_unsized *)addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; if (smc->use_fallback) { sock->state = rc ? SS_CONNECTING : SS_CONNECTED; goto out; } sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; } connected: rc = 0; sock->state = SS_CONNECTED; out: release_sock(sk); out_err: return rc; } static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) { struct socket *new_clcsock = NULL; struct sock *lsk = &lsmc->sk; struct sock *new_sk; int rc = -EINVAL; release_sock(lsk); new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; *new_smc = NULL; lock_sock(lsk); goto out; } *new_smc = smc_sk(new_sk); mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); lock_sock(lsk); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; smc_sock_set_flag(new_sk, SOCK_DEAD); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; } /* new clcsock has inherited the smc listen-specific sk_data_ready * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; /* if new clcsock has also inherited the fallback-specific callback * functions, switch them back to the original ones. */ if (lsmc->use_fallback) { if (lsmc->clcsk_state_change) new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; if (lsmc->clcsk_write_space) new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; if (lsmc->clcsk_error_report) new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; } (*new_smc)->clcsock = new_clcsock; out: return rc; } /* add a just created sock to the accept queue of the listen sock as * candidate for a following socket accept call from user space */ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) { struct smc_sock *par = smc_sk(parent); sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_added(parent); } /* remove a socket from the accept queue of its parental listening socket */ static void smc_accept_unlink(struct sock *sk) { struct smc_sock *par = smc_sk(sk)->listen_smc; spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock) { struct smc_sock *isk, *n; struct sock *new_sk; list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) { new_sk = (struct sock *)isk; smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); if (isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } sock_put(new_sk); /* final */ continue; } if (new_sock) { sock_graft(new_sk, new_sock); new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; } } return new_sk; } return NULL; } /* clean up for a created but never accepted sock */ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT); __smc_release(smc); release_sock(sk); sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; struct smc_llc_qentry *qentry; int rc; /* reg the sndbuf if it was vzalloced*/ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } /* reg the rmb */ if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) return SMC_CLC_DECL_RMBE_EC; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); if (link->lgr->max_links > 1) { down_write(&link->lgr->llc_conf_mutex); /* initial contact - try to establish second link */ smc_llc_srv_add_link(link, NULL); up_write(&link->lgr->llc_conf_mutex); } return 0; } /* listen worker: finish */ static void smc_listen_out(struct smc_sock *new_smc) { struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); release_sock(newsmcsk); /* lock in smc_listen_work() */ if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); release_sock(&lsmc->sk); } else { /* no longer listening */ smc_close_non_accepted(newsmcsk); } /* Wake up accept */ lsmc->sk.sk_data_ready(&lsmc->sk); sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ } /* listen worker: finish in state connected */ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; smc_listen_out(new_smc); } /* listen worker: finish in error state */ static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; struct net *net = sock_net(newsmcsk); this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; smc_listen_out(new_smc); } /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ smc_conn_abort(new_smc, local_first); if (reason_code < 0 || smc_switch_to_fallback(new_smc, reason_code)) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); return; } } smc_listen_out_connected(new_smc); } /* listen worker: version checking */ static int smc_listen_v2_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; struct smc_clc_v2_extension *pclc_v2_ext; int rc = SMC_CLC_DECL_PEERNOSMC; ini->smc_type_v1 = pclc->hdr.typev1; ini->smc_type_v2 = pclc->hdr.typev2; ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0; ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0; if (pclc->hdr.version > SMC_V1) { if (smcd_indicated(ini->smc_type_v2)) ini->smcd_version |= SMC_V2; if (smcr_indicated(ini->smc_type_v2)) ini->smcr_version |= SMC_V2; } if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { rc = SMC_CLC_DECL_PEERNOSMC; goto out; } pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) { ini->smcd_version &= ~SMC_V2; ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2EXT; goto out; } pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); if (ini->smcd_version & SMC_V2) { if (!smc_ism_is_v2_capable()) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOISM2SUPP; } else if (!pclc_smcd_v2_ext) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2DEXT; } else if (!pclc_v2_ext->hdr.eid_cnt && !pclc_v2_ext->hdr.flag.seid) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOUEID; } } if (ini->smcr_version & SMC_V2) { if (!pclc_v2_ext->hdr.eid_cnt) { ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOUEID; } } ini->release_nr = pclc_v2_ext->hdr.flag.release; if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE) ini->release_nr = SMC_RELEASE; out: if (!ini->smcd_version && !ini->smcr_version) return rc; return 0; } /* listen worker: check prefixes */ static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx) return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, struct smc_init_info *ini) { int rc; /* allocate connection / link group */ rc = smc_conn_create(new_smc, ini); if (rc) return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) { smc_conn_abort(new_smc, ini->first_contact_local); return SMC_CLC_DECL_MEM; } return 0; } /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_init_info *ini) { int rc; rc = smc_conn_create(new_smc, ini); if (rc) return rc; /* Create send and receive buffers */ rc = smc_buf_create(new_smc, true); if (rc) { smc_conn_abort(new_smc, ini->first_contact_local); return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; } return 0; } static bool smc_is_already_selected(struct smcd_dev *smcd, struct smc_init_info *ini, int matches) { int i; for (i = 0; i < matches; i++) if (smcd == ini->ism_dev[i]) return true; return false; } /* check for ISM devices matching proposed ISM devices */ static void smc_check_ism_v2_match(struct smc_init_info *ini, u16 proposed_chid, struct smcd_gid *proposed_gid, unsigned int *matches) { struct smcd_dev *smcd; list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->going_away) continue; if (smc_is_already_selected(smcd, ini, *matches)) continue; if (smc_ism_get_chid(smcd) == proposed_chid && !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { ini->ism_peer_gid[*matches].gid = proposed_gid->gid; if (__smc_ism_is_emulated(proposed_chid)) ini->ism_peer_gid[*matches].gid_ext = proposed_gid->gid_ext; /* non-Emulated-ISM's peer gid_ext remains 0. */ ini->ism_dev[*matches] = smcd; (*matches)++; break; } } } static void smc_init_info_store_rc(u32 rc, struct smc_init_info *ini) { if (!ini->rc) ini->rc = rc; } static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *smcd_v2_ext; struct smc_clc_v2_extension *smc_v2_ext; struct smc_clc_msg_smcd *pclc_smcd; unsigned int matches = 0; struct smcd_gid smcd_gid; u8 smcd_version; u8 *eid = NULL; int i, rc; u16 chid; if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) goto not_found; pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) goto not_found; mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { /* check for ISM device matching proposed native ISM device */ smcd_gid.gid = ntohll(pclc_smcd->ism.gid); smcd_gid.gid_ext = 0; smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), &smcd_gid, &matches); } for (i = 0; i < smc_v2_ext->hdr.ism_gid_cnt; i++) { /* check for ISM devices matching proposed non-native ISM * devices */ smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid); smcd_gid.gid_ext = 0; chid = ntohs(smcd_v2_ext->gidchid[i].chid); if (__smc_ism_is_emulated(chid)) { if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt || chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid)) /* each Emulated-ISM device takes two GID-CHID * entries and CHID of the second entry repeats * that of the first entry. * * So check if the next GID-CHID entry exists * and both two entries' CHIDs are the same. */ continue; smcd_gid.gid_ext = ntohll(smcd_v2_ext->gidchid[++i].gid); } smc_check_ism_v2_match(ini, chid, &smcd_gid, &matches); } mutex_unlock(&smcd_dev_list.mutex); if (!ini->ism_dev[0]) { smc_init_info_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; } smc_ism_get_system_eid(&eid); if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, smcd_v2_ext->system_eid, eid)) goto not_found; /* separate - outside the smcd_dev_list.lock */ smcd_version = ini->smcd_version; for (i = 0; i < matches; i++) { ini->smcd_version = SMC_V2; ini->is_smcd = true; ini->ism_selected = i; rc = smc_listen_ism_init(new_smc, ini); if (rc) { smc_init_info_store_rc(rc, ini); /* try next active ISM device */ continue; } return; /* matching and usable V2 ISM device found */ } /* no V2 ISM device could be initialized */ ini->smcd_version = smcd_version; /* restore original value */ ini->negotiated_eid[0] = 0; not_found: ini->smcd_version &= ~SMC_V2; ini->ism_dev[0] = NULL; ini->is_smcd = false; } static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); int rc = 0; /* check if ISM V1 is available */ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1) || !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); ini->ism_peer_gid[0].gid_ext = 0; rc = smc_find_ism_device(new_smc, ini); if (rc) goto not_found; ini->ism_selected = 0; rc = smc_listen_ism_init(new_smc, ini); if (!rc) return; /* V1 ISM device found */ not_found: smc_init_info_store_rc(rc, ini); ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; } /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) { struct smc_connection *conn = &new_smc->conn; if (!local_first) { /* reg sendbufs if they were vzalloced */ if (conn->sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(conn->lnk, conn->sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; } return 0; } static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_v2_extension *smc_v2_ext; u8 smcr_version; int rc; if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); if (!smc_v2_ext || !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); ini->check_smcrv2 = true; ini->smcrv2.clc_sk = new_smc->clcsock->sk; ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); if (rc) { smc_init_info_store_rc(rc, ini); goto not_found; } if (!ini->smcrv2.uses_gateway) memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); if (!rc) { rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); if (rc) smc_conn_abort(new_smc, ini->first_contact_local); } if (!rc) return; ini->smcr_version = smcr_version; smc_init_info_store_rc(rc, ini); not_found: ini->smcr_version &= ~SMC_V2; ini->smcrv2.ib_dev_v2 = NULL; ini->check_smcrv2 = false; } static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int rc; if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) return SMC_CLC_DECL_NOSMCDEV; /* prepare RDMA check */ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); rc = smc_find_rdma_device(new_smc, ini); if (rc) { /* no RDMA device found */ return SMC_CLC_DECL_NOSMCDEV; } rc = smc_listen_rdma_init(new_smc, ini); if (rc) return rc; return smc_listen_rdma_reg(new_smc, ini->first_contact_local); } /* determine the local device matching to proposal */ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int prfx_rc; /* check for ISM device matching V2 proposed device */ smc_find_ism_v2_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; /* check for matching IP prefix and subnet length (V1) */ prfx_rc = smc_listen_prfx_check(new_smc, pclc); if (prfx_rc) smc_init_info_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) return ini->rc ?: SMC_CLC_DECL_GETVLANERR; /* check for ISM device matching V1 proposed device */ if (!prfx_rc) smc_find_ism_v1_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; if (!smcr_indicated(pclc->hdr.typev1) && !smcr_indicated(pclc->hdr.typev2)) /* skip RDMA and decline */ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; /* check if RDMA V2 is available */ smc_find_rdma_v2_device_serv(new_smc, pclc, ini); if (ini->smcrv2.ib_dev_v2) return 0; /* check if RDMA V1 is available */ if (!prfx_rc) { int rc; rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); smc_init_info_store_rc(rc, ini); return (!rc) ? 0 : ini->rc; } return prfx_rc; } /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, bool local_first, struct smc_init_info *ini) { struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_first) smc_link_save_peer_info(link, cclc, ini); if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) return SMC_CLC_DECL_ERR_RTOK; if (local_first) { if (smc_ib_ready_link(link)) return SMC_CLC_DECL_ERR_RDYLNK; /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_serv_conf_first_link(new_smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); } return reason_code; } /* setup for connection of server */ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm *cclc; struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; struct smc_init_info *ini = NULL; u8 proposal_version = SMC_V1; u8 accept_version; int rc = 0; lock_sock(&new_smc->sk); /* release in smc_listen_out() */ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) return smc_listen_out_err(new_smc); if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); return; } /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); if (rc) smc_listen_out_err(new_smc); else smc_listen_out_connected(new_smc); return; } /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) { rc = SMC_CLC_DECL_MEM; goto out_decl; } pclc = (struct smc_clc_msg_proposal *)buf; rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf), SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (rc) goto out_decl; if (pclc->hdr.version > SMC_V1) proposal_version = SMC_V2; /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(new_smc)) { rc = SMC_CLC_DECL_IPSEC; goto out_decl; } ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) { rc = SMC_CLC_DECL_MEM; goto out_decl; } /* initial version checking */ rc = smc_listen_v2_check(new_smc, pclc, ini); if (rc) goto out_decl; rc = smc_clc_srv_v2x_features_validate(new_smc, pclc, ini); if (rc) goto out_decl; mutex_lock(&smc_server_lgr_pending); smc_rx_init(new_smc); smc_tx_init(new_smc); /* determine ISM or RoCE device used for connection */ rc = smc_listen_find_device(new_smc, pclc, ini); if (rc) goto out_unlock; /* send SMC Accept CLC message */ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, accept_version, ini->negotiated_eid, ini); if (rc) goto out_unlock; /* SMC-D does not need this lock any more */ if (ini->is_smcd) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ memset(buf, 0, sizeof(*buf)); cclc = (struct smc_clc_msg_accept_confirm *)buf; rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf), SMC_CLC_CONFIRM, CLC_WAIT_TIME); if (rc) { if (!ini->is_smcd) goto out_unlock; goto out_decl; } rc = smc_clc_v2x_features_confirm_check(cclc, ini); if (rc) { if (!ini->is_smcd) goto out_unlock; goto out_decl; } /* fce smc release version is needed in smc_listen_rdma_finish, * so save fce info here. */ smc_conn_save_peer_info_fce(new_smc, cclc); /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, ini->first_contact_local, ini); if (rc) goto out_unlock; mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); if (ini->is_smcd && smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) { rc = smcd_buf_attach(new_smc); if (rc) goto out_decl; } SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); /* smc_listen_out() will release smcsk */ smc_listen_out_connected(new_smc); goto out_free; out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, proposal_version); out_free: kfree(ini); kfree(buf); } static void smc_tcp_listen_work(struct work_struct *work) { struct smc_sock *lsmc = container_of(work, struct smc_sock, tcp_listen_work); struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) /* clcsock accept queue empty or error */ goto out; if (!new_smc) continue; if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_inc(&lsmc->queued_smc_hs); new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); } out: release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } static void smc_clcsock_data_ready(struct sock *listen_clcsock) { struct smc_sock *lsmc; read_lock_bh(&listen_clcsock->sk_callback_lock); lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } out: read_unlock_bh(&listen_clcsock->sk_callback_lock); } int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); lock_sock(sk); rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; if (sk->sk_state == SMC_LISTEN) { sk->sk_max_ack_backlog = backlog; goto out; } /* some socket options are handled in core, so we could not apply * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); if (!smc->use_fallback) tcp_sk(smc->clcsock->sk)->syn_smc = 1; /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; smc->af_ops = *smc->ori_af_ops; smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; if (smc->limit_smc_hs) tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; out: release_sock(sk); return rc; } int smc_accept(struct socket *sock, struct socket *new_sock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); struct smc_sock *lsmc; long timeo; int rc = 0; lsmc = smc_sk(sk); sock_hold(sk); /* sock_put below */ lock_sock(sk); if (lsmc->sk.sk_state != SMC_LISTEN) { rc = -EINVAL; release_sock(sk); goto out; } /* Wait for an incoming connection */ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { rc = -EAGAIN; break; } release_sock(sk); timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ sched_annotate_sleep(); lock_sock(sk); if (signal_pending(current)) { rc = sock_intr_errno(timeo); break; } } set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); if (!rc) rc = sock_error(nsk); release_sock(sk); if (rc) goto out; if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ timeo = secs_to_jiffies(lsmc->sockopt_defer_accept); if (smc_sk(nsk)->use_fallback) { struct sock *clcsk = smc_sk(nsk)->clcsock->sk; lock_sock(clcsk); if (skb_queue_empty(&clcsk->sk_receive_queue)) sk_wait_data(clcsk, &timeo, NULL); release_sock(clcsk); } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { lock_sock(nsk); smc_rx_wait(smc_sk(nsk), &timeo, 0, smc_rx_data_available); release_sock(nsk); } } out: sock_put(sk); /* sock_hold above */ return rc; } int smc_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct smc_sock *smc; if (peer && (sock->sk->sk_state != SMC_ACTIVE) && (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) return -ENOTCONN; smc = smc_sk(sock->sk); return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); lock_sock(sk); /* SMC does not support connect with fastopen */ if (msg->msg_flags & MSG_FASTOPEN) { /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); if (rc) goto out; } else { rc = -EINVAL; goto out; } } else if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_INIT)) { rc = -EPIPE; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); } else { rc = smc_tx_sendmsg(smc, msg, len); SMC_STAT_TX_PAYLOAD(smc, len, rc); } out: release_sock(sk); return rc; } int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -ENOTCONN; smc = smc_sk(sk); lock_sock(sk); if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { /* socket was connected before, no more data to read */ rc = 0; goto out; } if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || (sk->sk_state == SMC_CLOSED)) goto out; if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { rc = 0; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); } else { msg->msg_namelen = 0; rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); SMC_STAT_RX_PAYLOAD(smc, rc, rc); } out: release_sock(sk); return rc; } static __poll_t smc_accept_poll(struct sock *parent) { struct smc_sock *isk = smc_sk(parent); __poll_t mask = 0; spin_lock(&isk->accept_q_lock); if (!list_empty(&isk->accept_q)) mask = EPOLLIN | EPOLLRDNORM; spin_unlock(&isk->accept_q_lock); return mask; } __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct smc_sock *smc; __poll_t mask = 0; if (!sk) return EPOLLNVAL; smc = smc_sk(sock->sk); if (smc->use_fallback) { /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ mask |= smc_accept_poll(sk); } else if (smc->use_fallback) { /* as result of connect_work()*/ mask |= smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; } else { if ((sk->sk_state != SMC_INIT && atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (sk->sk_state != SMC_INIT) { /* Race breaker the same way as tcp_poll(). */ smp_mb__after_atomic(); if (atomic_read(&smc->conn.sndbuf_space)) mask |= EPOLLOUT | EPOLLWRNORM; } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; if (smc->conn.urg_state == SMC_URG_VALID) mask |= EPOLLPRI; } } return mask; } int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; bool do_shutdown = true; struct smc_sock *smc; int rc = -EINVAL; int old_state; int rc1 = 0; smc = smc_sk(sk); if ((how < SHUT_RD) || (how > SHUT_RDWR)) return rc; lock_sock(sk); if (sock->state == SS_CONNECTING) { if (sk->sk_state == SMC_ACTIVE) sock->state = SS_CONNECTED; else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || sk->sk_state == SMC_PEERCLOSEWAIT2 || sk->sk_state == SMC_APPCLOSEWAIT1 || sk->sk_state == SMC_APPCLOSEWAIT2 || sk->sk_state == SMC_APPFINCLOSEWAIT) sock->state = SS_DISCONNECTING; } rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && (sk->sk_state != SMC_PEERCLOSEWAIT2) && (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_APPCLOSEWAIT2) && (sk->sk_state != SMC_APPFINCLOSEWAIT)) goto out; if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; sk->sk_socket->state = SS_UNCONNECTED; sock_put(sk); } goto out; } switch (how) { case SHUT_RDWR: /* shutdown in both directions */ old_state = sk->sk_state; rc = smc_close_active(smc); if (old_state == SMC_ACTIVE && sk->sk_state == SMC_PEERCLOSEWAIT1) do_shutdown = false; break; case SHUT_WR: rc = smc_close_shutdown_write(smc); break; case SHUT_RD: rc = 0; /* nothing more to do because peer is not involved */ break; } if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; if (sk->sk_state == SMC_CLOSED) sock->state = SS_UNCONNECTED; else sock->state = SS_DISCONNECTING; out: release_sock(sk); return rc ? rc : rc1; } static int __smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; int val, len; smc = smc_sk(sock->sk); if (get_user(len, optlen)) return -EFAULT; len = min_t(int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case SMC_LIMIT_HS: val = smc->limit_smc_hs; break; default: return -EOPNOTSUPP; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int __smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; int val, rc; smc = smc_sk(sk); lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: if (optlen < sizeof(int)) { rc = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(int))) { rc = -EFAULT; break; } smc->limit_smc_hs = !!val; rc = 0; break; default: rc = -EOPNOTSUPP; break; } release_sock(sk); return rc; } int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; int val, rc; if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; else if (level == SOL_SMC) return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the * CLC socket */ mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; sk_error_report(sk); } mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); if (rc || smc->use_fallback) goto out; switch (optname) { case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } break; case TCP_NODELAY: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (val) { SMC_STAT_INC(smc, ndly_cnt); smc_tx_pending(&smc->conn); cancel_delayed_work(&smc->conn.tx_work); } } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (!val) { SMC_STAT_INC(smc, cork_cnt); smc_tx_pending(&smc->conn); cancel_delayed_work(&smc->conn.tx_work); } } break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; default: break; } out: release_sock(sk); return rc; } int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; int rc; if (level == SOL_SMC) return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) { mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); mutex_unlock(&smc->clcsock_release_lock); return rc; } int smc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { union smc_host_cursor cons, urg; struct smc_connection *conn; struct smc_sock *smc; int answ; smc = smc_sk(sock->sk); conn = &smc->conn; lock_sock(&smc->sk); if (smc->use_fallback) { if (!smc->clcsock) { release_sock(&smc->sk); return -EBADF; } answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); release_sock(&smc->sk); return answ; } switch (cmd) { case SIOCINQ: /* same as FIONREAD */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = atomic_read(&smc->conn.bytes_to_rcv); break; case SIOCOUTQ: /* output queue size (not send + not acked) */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = smc->conn.sndbuf_desc->len - atomic_read(&smc->conn.sndbuf_space); break; case SIOCOUTQNSD: /* output queue size (not send only) */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = smc_tx_prepared_sends(&smc->conn); break; case SIOCATMARK: if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) { answ = 0; } else { smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_curs_copy(&urg, &conn->urg_curs, conn); answ = smc_curs_diff(conn->rmb_desc->len, &cons, &urg) == 1; } break; default: release_sock(&smc->sk); return -ENOIOCTLCMD; } release_sock(&smc->sk); return put_user(answ, (int __user *)arg); } /* Map the affected portions of the rmbe into an spd, note the number of bytes * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor * updates till whenever a respective page has been fully processed. * Note that subsequent recv() calls have to wait till all splice() processing * completed. */ ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -ENOTCONN; smc = smc_sk(sk); lock_sock(sk); if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { /* socket was connected before, no more data to read */ rc = 0; goto out; } if (sk->sk_state == SMC_INIT || sk->sk_state == SMC_LISTEN || sk->sk_state == SMC_CLOSED) goto out; if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { rc = 0; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, pipe, len, flags); } else { if (*ppos) { rc = -ESPIPE; goto out; } if (flags & SPLICE_F_NONBLOCK) flags = MSG_DONTWAIT; else flags = 0; SMC_STAT_INC(smc, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: release_sock(sk); return rc; } /* must look like tcp */ static const struct proto_ops smc_sock_ops = { .family = PF_SMC, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; int smc_create_clcsk(struct net *net, struct sock *sk, int family) { struct smc_sock *smc = smc_sk(sk); int rc; rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); if (rc) return rc; /* smc_clcsock_release() does not wait smc->clcsock->sk's * destruction; its sk_state might not be TCP_CLOSE after * smc->sk is close()d, and TCP timers can be fired later, * which need net ref. */ sk = smc->clcsock->sk; sk_net_refcnt_upgrade(sk); return 0; } static int __smc_create(struct net *net, struct socket *sock, int protocol, int kern, struct socket *clcsock) { int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; struct smc_sock *smc; struct sock *sk; int rc; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_STREAM) goto out; rc = -EPROTONOSUPPORT; if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) goto out; rc = -ENOBUFS; sock->ops = &smc_sock_ops; sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); rc = 0; if (clcsock) smc->clcsock = clcsock; else rc = smc_create_clcsk(net, sk, family); if (rc) { sk_common_release(sk); sock->sk = NULL; } out: return rc; } static int smc_create(struct net *net, struct socket *sock, int protocol, int kern) { return __smc_create(net, sock, protocol, kern, NULL); } static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; static int smc_ulp_init(struct sock *sk) { struct socket *tcp = sk->sk_socket; struct net *net = sock_net(sk); struct socket *smcsock; int protocol, ret; /* only TCP can be replaced */ if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) return -ESOCKTNOSUPPORT; /* don't handle wq now */ if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) return -ENOTCONN; if (sk->sk_family == AF_INET) protocol = SMCPROTO_SMC; else protocol = SMCPROTO_SMC6; smcsock = sock_alloc(); if (!smcsock) return -ENFILE; smcsock->type = SOCK_STREAM; __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ ret = __smc_create(net, smcsock, protocol, 1, tcp); if (ret) { sock_release(smcsock); /* module_put() which ops won't be NULL */ return ret; } /* replace tcp socket to smc */ smcsock->file = tcp->file; smcsock->file->private_data = smcsock; smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ tcp->file = NULL; return ret; } static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); /* don't inherit ulp ops to child when listen */ icsk->icsk_ulp_ops = NULL; } static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { .name = "smc", .owner = THIS_MODULE, .init = smc_ulp_init, .clone = smc_ulp_clone, }; unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { int rc; rc = smc_sysctl_net_init(net); if (rc) return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } static __net_init int smc_net_stat_init(struct net *net) { return smc_stats_init(net); } static void __net_exit smc_net_stat_exit(struct net *net) { smc_stats_exit(net); } static struct pernet_operations smc_net_ops = { .init = smc_net_init, .exit = smc_net_exit, .id = &smc_net_id, .size = sizeof(struct smc_net), }; static struct pernet_operations smc_net_stat_ops = { .init = smc_net_stat_init, .exit = smc_net_stat_exit, }; static int __init smc_init(void) { int rc; rc = register_pernet_subsys(&smc_net_ops); if (rc) return rc; rc = register_pernet_subsys(&smc_net_stat_ops); if (rc) goto out_pernet_subsys; rc = smc_ism_init(); if (rc) goto out_pernet_subsys_stat; smc_clc_init(); rc = smc_nl_init(); if (rc) goto out_ism; rc = smc_pnet_init(); if (rc) goto out_nl; rc = -ENOMEM; smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_PERCPU, 0); if (!smc_tcp_ls_wq) goto out_pnet; smc_hs_wq = alloc_workqueue("smc_hs_wq", WQ_PERCPU, 0); if (!smc_hs_wq) goto out_alloc_tcp_ls_wq; smc_close_wq = alloc_workqueue("smc_close_wq", WQ_PERCPU, 0); if (!smc_close_wq) goto out_alloc_hs_wq; rc = smc_core_init(); if (rc) { pr_err("%s: smc_core_init fails with %d\n", __func__, rc); goto out_alloc_wqs; } rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); goto out_core; } rc = smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); goto out_core; } rc = proto_register(&smc_proto6, 1); if (rc) { pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); goto out_proto; } rc = sock_register(&smc_sock_family_ops); if (rc) { pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); goto out_sock; } rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); goto out_ib; } rc = smc_inet_init(); if (rc) { pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); goto out_ulp; } rc = bpf_smc_hs_ctrl_init(); if (rc) { pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__, rc); goto out_inet; } static_branch_enable(&tcp_have_smc); return 0; out_inet: smc_inet_exit(); out_ulp: tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); out_core: smc_core_exit(); out_alloc_wqs: destroy_workqueue(smc_close_wq); out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); out_alloc_tcp_ls_wq: destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: smc_nl_exit(); out_ism: smc_clc_exit(); smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); return rc; } static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); smc_inet_exit(); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); smc_nl_exit(); smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } module_init(smc_init); module_exit(smc_exit); MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); MODULE_ALIAS_TCP_ULP("smc"); /* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1); #if IS_ENABLED(CONFIG_IPV6) MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 256, 1); #endif /* CONFIG_IPV6 */ MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);
460 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H #include <linux/bug.h> #include <linux/kasan-enabled.h> #include <linux/kasan-tags.h> #include <linux/kernel.h> #include <linux/static_key.h> #include <linux/types.h> struct kmem_cache; struct page; struct slab; struct vm_struct; struct task_struct; #ifdef CONFIG_KASAN #include <linux/linkage.h> #include <asm/kasan.h> #endif typedef unsigned int __bitwise kasan_vmalloc_flags_t; #define KASAN_VMALLOC_NONE ((__force kasan_vmalloc_flags_t)0x00u) #define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) #define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) #define KASAN_VMALLOC_KEEP_TAG ((__force kasan_vmalloc_flags_t)0x08u) #define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */ #define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include <linux/pgtable.h> /* Software KASAN implementations use shadow memory. */ #ifdef CONFIG_KASAN_SW_TAGS /* This matches KASAN_TAG_INVALID. */ #define KASAN_SHADOW_INIT 0xFE #else #define KASAN_SHADOW_INIT 0 #endif #ifndef PTE_HWTABLE_PTRS #define PTE_HWTABLE_PTRS 0 #endif extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]; extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD]; extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); #ifndef kasan_mem_to_shadow static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } #endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); /* Enable reporting bugs after kasan_disable_current() */ extern void kasan_enable_current(void); /* Disable reporting bugs for current task */ extern void kasan_disable_current(void); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline int kasan_add_zero_shadow(void *start, unsigned long size) { return 0; } static inline void kasan_remove_zero_shadow(void *start, unsigned long size) {} static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN_HW_TAGS #else /* CONFIG_KASAN_HW_TAGS */ #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) { return kasan_hw_tags_enabled(); } #ifdef CONFIG_KASAN void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { if (kasan_enabled()) __kasan_unpoison_range(addr, size); } void __kasan_poison_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) __kasan_poison_pages(page, order, init); } bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) return __kasan_unpoison_pages(page, order, init); return false; } void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { if (kasan_enabled()) __kasan_poison_slab(slab); } void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object); /** * kasan_unpoison_new_object - Temporarily unpoison a new slab object. * @cache: Cache the object belong to. * @object: Pointer to the object. * * This function is intended for the slab allocator's internal use. It * temporarily unpoisons an object from a newly allocated slab without doing * anything else. The object must later be repoisoned by * kasan_poison_new_object(). */ static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) __kasan_unpoison_new_object(cache, object); } void __kasan_poison_new_object(struct kmem_cache *cache, void *object); /** * kasan_poison_new_object - Repoison a new slab object. * @cache: Cache the object belong to. * @object: Pointer to the object. * * This function is intended for the slab allocator's internal use. It * repoisons an object that was previously unpoisoned by * kasan_unpoison_new_object() without doing anything else. */ static __always_inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) __kasan_poison_new_object(cache, object); } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object); static __always_inline void * __must_check kasan_init_slab_obj( struct kmem_cache *cache, const void *object) { if (kasan_enabled()) return __kasan_init_slab_obj(cache, object); return (void *)object; } bool __kasan_slab_pre_free(struct kmem_cache *s, void *object, unsigned long ip); /** * kasan_slab_pre_free - Check whether freeing a slab object is safe. * @object: Object to be freed. * * This function checks whether freeing the given object is safe. It may * check for double-free and invalid-free bugs and report them. * * This function is intended only for use by the slab allocator. * * @Return true if freeing the object is unsafe; false otherwise. */ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) { if (kasan_enabled()) return __kasan_slab_pre_free(s, object, _RET_IP_); return false; } bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, bool still_accessible, bool no_quarantine); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. * @init: Whether to initialize the object. * @still_accessible: Whether the object contents are still accessible. * * This function informs that a slab object has been freed and is not * supposed to be accessed anymore, except when @still_accessible is set * (indicating that the object is in a SLAB_TYPESAFE_BY_RCU cache and an RCU * grace period might not have passed yet). * * For KASAN modes that have integrated memory initialization * (kasan_has_integrated_init() == true), this function also initializes * the object's memory. For other modes, the @init argument is ignored. * * This function might also take ownership of the object to quarantine it. * When this happens, KASAN will defer freeing the object to a later * stage and handle it internally until then. The return value indicates * whether KASAN took ownership of the object. * * This function is intended only for use by the slab allocator. * * @Return true if KASAN took ownership of the object; false otherwise. */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init, bool still_accessible, bool no_quarantine) { if (kasan_enabled()) return __kasan_slab_free(s, object, init, still_accessible, no_quarantine); return false; } void __kasan_kfree_large(void *ptr, unsigned long ip); static __always_inline void kasan_kfree_large(void *ptr) { if (kasan_enabled()) __kasan_kfree_large(ptr, _RET_IP_); } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( struct kmem_cache *s, void *object, gfp_t flags, bool init) { if (kasan_enabled()) return __kasan_slab_alloc(s, object, flags, init); return object; } void * __must_check __kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags); static __always_inline void * __must_check kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { if (kasan_enabled()) return __kasan_kmalloc(s, object, size, flags); return (void *)object; } void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); static __always_inline void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { if (kasan_enabled()) return __kasan_kmalloc_large(ptr, size, flags); return (void *)ptr; } void * __must_check __kasan_krealloc(const void *object, size_t new_size, gfp_t flags); static __always_inline void * __must_check kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { if (kasan_enabled()) return __kasan_krealloc(object, new_size, flags); return (void *)object; } bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, unsigned long ip); /** * kasan_mempool_poison_pages - Check and poison a mempool page allocation. * @page: Pointer to the page allocation. * @order: Order of the allocation. * * This function is intended for kernel subsystems that cache page allocations * to reuse them instead of freeing them back to page_alloc (e.g. mempool). * * This function is similar to kasan_mempool_poison_object() but operates on * page allocations. * * Before the poisoned allocation can be reused, it must be unpoisoned via * kasan_mempool_unpoison_pages(). * * Return: true if the allocation can be safely reused; false otherwise. */ static __always_inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order) { if (kasan_enabled()) return __kasan_mempool_poison_pages(page, order, _RET_IP_); return true; } void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, unsigned long ip); /** * kasan_mempool_unpoison_pages - Unpoison a mempool page allocation. * @page: Pointer to the page allocation. * @order: Order of the allocation. * * This function is intended for kernel subsystems that cache page allocations * to reuse them instead of freeing them back to page_alloc (e.g. mempool). * * This function unpoisons a page allocation that was previously poisoned by * kasan_mempool_poison_pages() without zeroing the allocation's memory. For * the tag-based modes, this function assigns a new tag to the allocation. */ static __always_inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) { if (kasan_enabled()) __kasan_mempool_unpoison_pages(page, order, _RET_IP_); } bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); /** * kasan_mempool_poison_object - Check and poison a mempool slab allocation. * @ptr: Pointer to the slab allocation. * * This function is intended for kernel subsystems that cache slab allocations * to reuse them instead of freeing them back to the slab allocator (e.g. * mempool). * * This function poisons a slab allocation and saves a free stack trace for it * without initializing the allocation's memory and without putting it into the * quarantine (for the Generic mode). * * This function also performs checks to detect double-free and invalid-free * bugs and reports them. The caller can use the return value of this function * to find out if the allocation is buggy. * * Before the poisoned allocation can be reused, it must be unpoisoned via * kasan_mempool_unpoison_object(). * * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the * size > KMALLOC_MAX_SIZE). * * Return: true if the allocation can be safely reused; false otherwise. */ static __always_inline bool kasan_mempool_poison_object(void *ptr) { if (kasan_enabled()) return __kasan_mempool_poison_object(ptr, _RET_IP_); return true; } void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); /** * kasan_mempool_unpoison_object - Unpoison a mempool slab allocation. * @ptr: Pointer to the slab allocation. * @size: Size to be unpoisoned. * * This function is intended for kernel subsystems that cache slab allocations * to reuse them instead of freeing them back to the slab allocator (e.g. * mempool). * * This function unpoisons a slab allocation that was previously poisoned via * kasan_mempool_poison_object() and saves an alloc stack trace for it without * initializing the allocation's memory. For the tag-based modes, this function * does not assign a new tag to the allocation and instead restores the * original tags based on the pointer value. * * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the * size > KMALLOC_MAX_SIZE). */ static __always_inline void kasan_mempool_unpoison_object(void *ptr, size_t size) { if (kasan_enabled()) __kasan_mempool_unpoison_object(ptr, size, _RET_IP_); } /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. */ bool __kasan_check_byte(const void *addr, unsigned long ip); static __always_inline bool kasan_check_byte(const void *addr) { if (kasan_enabled()) return __kasan_check_byte(addr, _RET_IP_); return true; } #else /* CONFIG_KASAN */ static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { return false; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) {} static inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) {} static inline void *kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { return (void *)object; } static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) { return false; } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init, bool still_accessible, bool no_quarantine) { return false; } static inline void kasan_kfree_large(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { return object; } static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { return (void *)object; } static inline void *kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { return (void *)ptr; } static inline void *kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { return (void *)object; } static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order) { return true; } static inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) {} static inline bool kasan_mempool_poison_object(void *ptr) { return true; } static inline void kasan_mempool_unpoison_object(void *ptr, size_t size) {} static inline bool kasan_check_byte(const void *address) { return true; } #endif /* CONFIG_KASAN */ #if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) void kasan_unpoison_task_stack(struct task_struct *task); asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); #else static inline void kasan_unpoison_task_stack(struct task_struct *task) {} static inline void kasan_unpoison_task_stack_below(const void *watermark) {} #endif #ifdef CONFIG_KASAN_GENERIC struct kasan_cache { int alloc_meta_offset; int free_meta_offset; }; size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); #else /* CONFIG_KASAN_GENERIC */ /* Tag-based KASAN modes do not use per-object metadata. */ static inline size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { return 0; } /* And no cache-related metadata initialization is required. */ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) static inline void *kasan_reset_tag(const void *addr) { return (void *)arch_kasan_reset_tag(addr); } /** * kasan_report - print a report about a bad memory access detected by KASAN * @addr: address of the bad access * @size: size of the bad access * @is_write: whether the bad access is a write or a read * @ip: instruction pointer for the accessibility check or the bad access itself */ bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline void *kasan_reset_tag(const void *addr) { return (void *)addr; } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS*/ #ifdef CONFIG_KASAN_HW_TAGS void kasan_report_async(void); #endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_GENERIC void __init kasan_init_generic(void); #else static inline void kasan_init_generic(void) { } #endif #ifdef CONFIG_KASAN_SW_TAGS void __init kasan_init_sw_tags(void); #else static inline void kasan_init_sw_tags(void) { } #endif #ifdef CONFIG_KASAN_HW_TAGS void kasan_init_hw_tags_cpu(void); void __init kasan_init_hw_tags(void); #else static inline void kasan_init_hw_tags_cpu(void) { } static inline void kasan_init_hw_tags(void) { } #endif #ifdef CONFIG_KASAN_VMALLOC #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); static inline int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { if (kasan_enabled()) return __kasan_populate_vmalloc(addr, size, gfp_mask); return 0; } void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) { if (kasan_enabled()) return __kasan_release_vmalloc(start, end, free_region_start, free_region_end, flags); } #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size, gfp_t gfp_mask) { return 0; } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) { } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags); static __always_inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { if (kasan_enabled()) return __kasan_unpoison_vmalloc(start, size, flags); return (void *)start; } void __kasan_poison_vmalloc(const void *start, unsigned long size); static __always_inline void kasan_poison_vmalloc(const void *start, unsigned long size) { if (kasan_enabled()) __kasan_poison_vmalloc(start, size); } void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, kasan_vmalloc_flags_t flags); static __always_inline void kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, kasan_vmalloc_flags_t flags) { if (kasan_enabled()) __kasan_unpoison_vmap_areas(vms, nr_vms, flags); } #else /* CONFIG_KASAN_VMALLOC */ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size, gfp_t gfp_mask) { return 0; } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) { } static inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { return (void *)start; } static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } static __always_inline void kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, kasan_vmalloc_flags_t flags) { } #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) /* * These functions allocate and free shadow memory for kernel modules. * They are only required when KASAN_VMALLOC is not supported, as otherwise * shadow memory is allocated by the generic vmalloc handlers. */ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_non_canonical_hook(unsigned long addr); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline void kasan_non_canonical_hook(unsigned long addr) { } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #endif /* LINUX_KASAN_H */
25 25 25 25 11 11 11 11 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (C) 2019 Netronome Systems, Inc. */ #ifndef __NET_TC_MPLS_H #define __NET_TC_MPLS_H #include <linux/tc_act/tc_mpls.h> #include <net/act_api.h> struct tcf_mpls_params { int tcfm_action; u32 tcfm_label; int action; /* tcf_action */ u8 tcfm_tc; u8 tcfm_ttl; u8 tcfm_bos; __be16 tcfm_proto; struct rcu_head rcu; }; #define ACT_MPLS_TC_NOT_SET 0xff #define ACT_MPLS_BOS_NOT_SET 0xff #define ACT_MPLS_LABEL_NOT_SET 0xffffffff struct tcf_mpls { struct tc_action common; struct tcf_mpls_params __rcu *mpls_p; }; #define to_mpls(a) ((struct tcf_mpls *)a) static inline u32 tcf_mpls_action(const struct tc_action *a) { u32 tcfm_action; rcu_read_lock(); tcfm_action = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_action; rcu_read_unlock(); return tcfm_action; } static inline __be16 tcf_mpls_proto(const struct tc_action *a) { __be16 tcfm_proto; rcu_read_lock(); tcfm_proto = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_proto; rcu_read_unlock(); return tcfm_proto; } static inline u32 tcf_mpls_label(const struct tc_action *a) { u32 tcfm_label; rcu_read_lock(); tcfm_label = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_label; rcu_read_unlock(); return tcfm_label; } static inline u8 tcf_mpls_tc(const struct tc_action *a) { u8 tcfm_tc; rcu_read_lock(); tcfm_tc = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_tc; rcu_read_unlock(); return tcfm_tc; } static inline u8 tcf_mpls_bos(const struct tc_action *a) { u8 tcfm_bos; rcu_read_lock(); tcfm_bos = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_bos; rcu_read_unlock(); return tcfm_bos; } static inline u8 tcf_mpls_ttl(const struct tc_action *a) { u8 tcfm_ttl; rcu_read_lock(); tcfm_ttl = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_ttl; rcu_read_unlock(); return tcfm_ttl; } #endif /* __NET_TC_MPLS_H */
3 3 3 17 17 17 7 8 4 2 4 4 4 4 4 2 3 4 3 4 1 3 1 4 1 4 1 4 4 4 4 4 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 ethtool hooks for cfg80211 * * Copied from cfg.c - originally * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014 Intel Corporation (Author: Johannes Berg) * Copyright (C) 2018, 2022-2023 Intel Corporation */ #include <linux/types.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" static int ieee80211_set_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; guard(wiphy)(local->hw.wiphy); return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); } static void ieee80211_get_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); memset(rp, 0, sizeof(*rp)); guard(wiphy)(local->hw.wiphy); drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "tx_handlers_drop", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) static int ieee80211_get_sset_count(struct net_device *dev, int sset) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int rv = 0; if (sset == ETH_SS_STATS) rv += STA_STATS_LEN; rv += drv_get_et_sset_count(sdata, sset); if (rv == 0) return -EOPNOTSUPP; return rv; } static void ieee80211_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct sta_info *sta; struct ieee80211_local *local = sdata->local; struct station_info sinfo; struct survey_info survey; int i, q; #define STA_STATS_SURVEY_LEN 7 memset(data, 0, sizeof(u64) * STA_STATS_LEN); #define ADD_STA_STATS(sta) \ do { \ data[i++] += sinfo.rx_packets; \ data[i++] += sinfo.rx_bytes; \ data[i++] += (sta)->rx_stats.num_duplicates; \ data[i++] += (sta)->rx_stats.fragments; \ data[i++] += sinfo.rx_dropped_misc; \ \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ data[i++] += (sta)->status_stats.filtered; \ data[i++] += sinfo.tx_failed; \ data[i++] += sinfo.tx_retries; \ } while (0) /* For Managed stations, find the single station based on BSSID * and use that. For interface types, iterate through all available * stations and add stats for any station that is assigned to this * network device. */ guard(wiphy)(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid); if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); data[i++] = sdata->tx_handlers_drop; data[i++] = sta->sta_state; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { list_for_each_entry(sta, &local->sta_list, list) { /* Make sure this station belongs to the proper dev */ if (sta->sdata->dev != dev) continue; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); data[i++] = sdata->tx_handlers_drop; } } do_survey: i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; /* Get survey stats for current channel */ survey.filled = 0; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; else if (local->open_count > 0 && local->open_count == local->virt_monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) channel = local->monitor_chanreq.oper.chan; else channel = NULL; rcu_read_unlock(); if (channel) { q = 0; do { survey.filled = 0; if (drv_get_survey(local, q, &survey) != 0) { survey.filled = 0; break; } q++; } while (channel != survey.channel); } if (survey.filled) data[i++] = survey.channel->center_freq; else data[i++] = 0; if (survey.filled & SURVEY_INFO_NOISE_DBM) data[i++] = (u8)survey.noise; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME) data[i++] = survey.time; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_BUSY) data[i++] = survey.time_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_EXT_BUSY) data[i++] = survey.time_ext_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_RX) data[i++] = survey.time_rx; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_TX) data[i++] = survey.time_tx; else data[i++] = -1LL; if (WARN_ON(i != STA_STATS_LEN)) return; drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int sz_sta_stats = 0; if (sset == ETH_SS_STATS) { sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); } drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); } static int ieee80211_get_regs_len(struct net_device *dev) { return 0; } static void ieee80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; regs->version = wdev->wiphy->hw_version; regs->len = 0; } const struct ethtool_ops ieee80211_ethtool_ops = { .get_drvinfo = cfg80211_get_drvinfo, .get_regs_len = ieee80211_get_regs_len, .get_regs = ieee80211_get_regs, .get_link = ethtool_op_get_link, .get_ringparam = ieee80211_get_ringparam, .set_ringparam = ieee80211_set_ringparam, .get_strings = ieee80211_get_strings, .get_ethtool_stats = ieee80211_get_stats, .get_sset_count = ieee80211_get_sset_count, };
99 75 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 // SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/poll.h> #include <linux/security.h> #include "internal.h" struct poll_table_struct; static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static ssize_t default_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return count; } const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; #define F_DENTRY(filp) ((filp)->f_path.dentry) void *debugfs_get_aux(const struct file *file) { return DEBUGFS_I(file_inode(file))->aux; } EXPORT_SYMBOL_GPL(debugfs_get_aux); enum dbgfs_get_mode { DBGFS_GET_ALREADY, DBGFS_GET_REGULAR, DBGFS_GET_SHORT, }; static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) { struct debugfs_fsdata *fsd; void *d_fsd; /* * This could only happen if some debugfs user erroneously calls * debugfs_file_get() on a dentry that isn't even a file, let * them know about it. */ if (WARN_ON(!d_is_reg(dentry))) return -EINVAL; d_fsd = READ_ONCE(dentry->d_fsdata); if (d_fsd) { fsd = d_fsd; } else { struct inode *inode = dentry->d_inode; unsigned int methods = 0; if (WARN_ON(mode == DBGFS_GET_ALREADY)) return -EINVAL; fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; if (mode == DBGFS_GET_SHORT) { const struct debugfs_short_fops *ops; ops = fsd->short_fops = DEBUGFS_I(inode)->short_fops; if (ops->llseek) methods |= HAS_LSEEK; if (ops->read) methods |= HAS_READ; if (ops->write) methods |= HAS_WRITE; fsd->real_fops = NULL; } else { const struct file_operations *ops; ops = fsd->real_fops = DEBUGFS_I(inode)->real_fops; if (ops->llseek) methods |= HAS_LSEEK; if (ops->read) methods |= HAS_READ; if (ops->write) methods |= HAS_WRITE; if (ops->unlocked_ioctl) methods |= HAS_IOCTL; if (ops->poll) methods |= HAS_POLL; fsd->short_fops = NULL; } fsd->methods = methods; refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); INIT_LIST_HEAD(&fsd->cancellations); mutex_init(&fsd->cancellations_mtx); d_fsd = cmpxchg(&dentry->d_fsdata, NULL, fsd); if (d_fsd) { mutex_destroy(&fsd->cancellations_mtx); kfree(fsd); fsd = d_fsd; } } /* * In case of a successful cmpxchg() above, this check is * strictly necessary and must follow it, see the comment in * __debugfs_remove_file(). * OTOH, if the cmpxchg() hasn't been executed or wasn't * successful, this serves the purpose of not starving * removers. */ if (d_unlinked(dentry)) return -EIO; if (!refcount_inc_not_zero(&fsd->active_users)) return -EIO; return 0; } /** * debugfs_file_get - mark the beginning of file data access * @dentry: the dentry object whose data is being accessed. * * Up to a matching call to debugfs_file_put(), any successive call * into the file removing functions debugfs_remove() and * debugfs_remove_recursive() will block. Since associated private * file data may only get freed after a successful return of any of * the removal functions, you may safely access it after a successful * call to debugfs_file_get() without worrying about lifetime issues. * * If -%EIO is returned, the file has already been removed and thus, * it is not safe to access any of its data. If, on the other hand, * it is allowed to access the file data, zero is returned. */ int debugfs_file_get(struct dentry *dentry) { return __debugfs_file_get(dentry, DBGFS_GET_ALREADY); } EXPORT_SYMBOL_GPL(debugfs_file_get); /** * debugfs_file_put - mark the end of file data access * @dentry: the dentry object formerly passed to * debugfs_file_get(). * * Allow any ongoing concurrent call into debugfs_remove() or * debugfs_remove_recursive() blocked by a former call to * debugfs_file_get() to proceed and return to its caller. */ void debugfs_file_put(struct dentry *dentry) { struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata); if (refcount_dec_and_test(&fsd->active_users)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_file_put); /** * debugfs_enter_cancellation - enter a debugfs cancellation * @file: the file being accessed * @cancellation: the cancellation object, the cancel callback * inside of it must be initialized * * When a debugfs file is removed it needs to wait for all active * operations to complete. However, the operation itself may need * to wait for hardware or completion of some asynchronous process * or similar. As such, it may need to be cancelled to avoid long * waits or even deadlocks. * * This function can be used inside a debugfs handler that may * need to be cancelled. As soon as this function is called, the * cancellation's 'cancel' callback may be called, at which point * the caller should proceed to call debugfs_leave_cancellation() * and leave the debugfs handler function as soon as possible. * Note that the 'cancel' callback is only ever called in the * context of some kind of debugfs_remove(). * * This function must be paired with debugfs_leave_cancellation(). */ void debugfs_enter_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); INIT_LIST_HEAD(&cancellation->list); if (WARN_ON(!d_is_reg(dentry))) return; if (WARN_ON(!cancellation->cancel)) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd)) return; mutex_lock(&fsd->cancellations_mtx); list_add(&cancellation->list, &fsd->cancellations); mutex_unlock(&fsd->cancellations_mtx); /* if we're already removing wake it up to cancel */ if (d_unlinked(dentry)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_enter_cancellation); /** * debugfs_leave_cancellation - leave cancellation section * @file: the file being accessed * @cancellation: the cancellation previously registered with * debugfs_enter_cancellation() * * See the documentation of debugfs_enter_cancellation(). */ void debugfs_leave_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); if (WARN_ON(!d_is_reg(dentry))) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd)) return; mutex_lock(&fsd->cancellations_mtx); if (!list_empty(&cancellation->list)) list_del(&cancellation->list); mutex_unlock(&fsd->cancellations_mtx); } EXPORT_SYMBOL_GPL(debugfs_leave_cancellation); /* * Only permit access to world-readable files when the kernel is locked down. * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && (!real_fops || (!real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap))) return 0; if (security_locked_down(LOCKDOWN_DEBUGFS)) return -EPERM; return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = DEBUGFS_I(inode)->real_fops; int r; r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? -ENOENT : r; r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not clean up after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } replace_fops(filp, real_fops); if (real_fops->open) r = real_fops->open(inode, filp); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_open_proxy_file_operations = { .open = open_proxy_open, }; #define PROTO(args...) args #define ARGS(args...) args #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ struct debugfs_fsdata *fsd = dentry->d_fsdata; \ ret_type r; \ \ if (!(fsd->methods & bit)) \ return ret; \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ r = fsd->real_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } #define SHORT_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \ static ret_type short_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ struct debugfs_fsdata *fsd = dentry->d_fsdata; \ ret_type r; \ \ if (!(fsd->methods & bit)) \ return ret; \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ r = fsd->short_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } SHORT_PROXY_FUNC(llseek, loff_t, filp, PROTO(struct file *filp, loff_t offset, int whence), ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE); FULL_PROXY_FUNC(llseek, loff_t, filp, PROTO(struct file *filp, loff_t offset, int whence), ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE); SHORT_PROXY_FUNC(read, ssize_t, filp, PROTO(struct file *filp, char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL); FULL_PROXY_FUNC(read, ssize_t, filp, PROTO(struct file *filp, char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL); SHORT_PROXY_FUNC(write, ssize_t, filp, PROTO(struct file *filp, const char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL); FULL_PROXY_FUNC(write, ssize_t, filp, PROTO(struct file *filp, const char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), ARGS(filp, cmd, arg), HAS_IOCTL, -ENOTTY); static __poll_t full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { struct dentry *dentry = F_DENTRY(filp); struct debugfs_fsdata *fsd = dentry->d_fsdata; __poll_t r = 0; if (!(fsd->methods & HAS_POLL)) return DEFAULT_POLLMASK; if (debugfs_file_get(dentry)) return EPOLLHUP; r = fsd->real_fops->poll(filp, wait); debugfs_file_put(dentry); return r; } static int full_proxy_release(struct inode *inode, struct file *file) { struct debugfs_fsdata *fsd = F_DENTRY(file)->d_fsdata; const struct file_operations *real_fops = fsd->real_fops; int r = 0; /* * We must not protect this against removal races here: the * original releaser should be called unconditionally in order * not to leak any resources. Releasers must not assume that * ->i_private is still being meaningful here. */ if (real_fops->release) r = real_fops->release(inode, file); fops_put(real_fops); return r; } static int full_proxy_open_regular(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops; struct debugfs_fsdata *fsd; int r; r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? -ENOENT : r; fsd = dentry->d_fsdata; real_fops = fsd->real_fops; r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not cleanup after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } if (real_fops->open) { r = real_fops->open(inode, filp); if (r) { fops_put(real_fops); } else if (filp->f_op != &debugfs_full_proxy_file_operations) { /* No protection against file removal anymore. */ WARN(1, "debugfs file owner replaced proxy fops: %pd", dentry); fops_put(real_fops); } } out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open_regular, .release = full_proxy_release, .llseek = full_proxy_llseek, .read = full_proxy_read, .write = full_proxy_write, .poll = full_proxy_poll, .unlocked_ioctl = full_proxy_unlocked_ioctl }; static int full_proxy_open_short(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); int r; r = __debugfs_file_get(dentry, DBGFS_GET_SHORT); if (r) return r == -EIO ? -ENOENT : r; r = debugfs_locked_down(inode, filp, NULL); if (!r) r = simple_open(inode, filp); debugfs_file_put(dentry); return r; } const struct file_operations debugfs_full_short_proxy_file_operations = { .open = full_proxy_open_short, .llseek = short_proxy_llseek, .read = short_proxy_read, .write = short_proxy_write, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; ret = simple_attr_read(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_read); static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; if (is_signed) ret = simple_attr_write_signed(file, buf, len, ppos); else ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(debugfs_attr_write); ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, const struct file_operations *fops_ro, const struct file_operations *fops_wo) { /* if there are no write bits set, make read only */ if (!(mode & S_IWUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_ro); /* if there are no read bits set, make write only */ if (!(mode & S_IRUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_wo); return debugfs_create_file_unsafe(name, mode, parent, value, fops); } static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; return 0; } static int debugfs_u8_get(void *data, u64 *val) { *val = *(u8 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); static int debugfs_u16_set(void *data, u64 val) { *(u16 *)data = val; return 0; } static int debugfs_u16_get(void *data, u64 *val) { *val = *(u16 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); static int debugfs_u32_set(void *data, u64 val) { *(u32 *)data = val; return 0; } static int debugfs_u32_get(void *data, u64 *val) { *val = *(u32 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); static int debugfs_u64_set(void *data, u64 val) { *(u64 *)data = val; return 0; } static int debugfs_u64_get(void *data, u64 *val) { *val = *(u64 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); static int debugfs_ulong_set(void *data, u64 val) { *(unsigned long *)data = val; return 0; } static int debugfs_ulong_get(void *data, u64 *val) { *val = *(unsigned long *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); /** * debugfs_create_ulong - create a debugfs file that is used to read and write * an unsigned long value. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong, &fops_ulong_ro, &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value * * These functions are exactly the same as the above functions (but use a hex * output for the decimal challenged). For details look at the above unsigned * decimal functions. */ /** * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); /** * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); /** * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); /** * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); static int debugfs_size_t_set(void *data, u64 val) { *(size_t *)data = val; return 0; } static int debugfs_size_t_get(void *data, u64 *val) { *val = *(size_t *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t, &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); static int debugfs_atomic_t_set(void *data, u64 val) { atomic_set((atomic_t *)data, val); return 0; } static int debugfs_atomic_t_get(void *data, u64 *val) { *val = atomic_read((atomic_t *)data); return 0; } DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** * debugfs_create_atomic_t - create a debugfs file that is used to read and * write an atomic_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t, &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); r = debugfs_file_get(dentry); if (unlikely(r)) return r; val = *(bool *)file->private_data; debugfs_file_put(dentry); if (val) buf[0] = 'Y'; else buf[0] = 'N'; buf[1] = '\n'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool bv; int r; bool *val = file->private_data; struct dentry *dentry = F_DENTRY(file); r = kstrtobool_from_user(user_buf, count, &bv); if (!r) { r = debugfs_file_get(dentry); if (unlikely(r)) return r; *val = bv; debugfs_file_put(dentry); } return count; } EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { .read = debugfs_read_file_bool, .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_ro = { .read = debugfs_read_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_wo = { .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *str, *copy = NULL; int copy_len, len; ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; str = *(char **)file->private_data; len = strlen(str) + 1; copy = kmalloc(len, GFP_KERNEL); if (!copy) { debugfs_file_put(dentry); return -ENOMEM; } copy_len = strscpy(copy, str, len); debugfs_file_put(dentry); if (copy_len < 0) { kfree(copy); return copy_len; } copy[copy_len] = '\n'; ret = simple_read_from_buffer(user_buf, count, ppos, copy, len); kfree(copy); return ret; } EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *old, *new = NULL; int pos = *ppos; int r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; old = *(char **)file->private_data; /* only allow strict concatenation */ r = -EINVAL; if (pos && pos != strlen(old)) goto error; r = -E2BIG; if (pos + count + 1 > PAGE_SIZE) goto error; r = -ENOMEM; new = kmalloc(pos + count + 1, GFP_KERNEL); if (!new) goto error; if (pos) memcpy(new, old, pos); r = -EFAULT; if (copy_from_user(new + pos, user_buf, count)) goto error; new[pos + count] = '\0'; strim(new); rcu_assign_pointer(*(char __rcu **)file->private_data, new); synchronize_rcu(); kfree(old); debugfs_file_put(dentry); return count; error: kfree(new); debugfs_file_put(dentry); return r; } static const struct file_operations fops_str = { .read = debugfs_read_file_str, .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_ro = { .read = debugfs_read_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_wo = { .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_str - create a debugfs file that is used to read and write a string value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str, &fops_str_ro, &fops_str_wo); } static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_read_from_buffer(user_buf, count, ppos, blob->data, blob->size); debugfs_file_put(dentry); return r; } static ssize_t write_file_blob(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf, count); debugfs_file_put(dentry); return r; } static const struct file_operations fops_blob = { .read = read_file_blob, .write = write_file_blob, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_blob - create a debugfs file that is used to read and write * a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer * to the blob data and the size of the data. * * This function creates a file in debugfs with the given name that exports * @blob->data as a binary blob. If the @mode variable is so set it can be * read from and written to. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); static size_t u32_format_array(char *buf, size_t bufsize, u32 *array, int array_size) { size_t ret = 0; while (--array_size >= 0) { size_t len; char term = array_size ? ' ' : '\n'; len = snprintf(buf, bufsize, "%u%c", *array++, term); ret += len; buf += len; bufsize -= len; } return ret; } static int u32_array_open(struct inode *inode, struct file *file) { struct debugfs_u32_array *data = inode->i_private; int size, elements = data->n_elements; char *buf; /* * Max size: * - 10 digits + ' '/'\n' = 11 bytes per number * - terminating NUL character */ size = elements*11; buf = kmalloc(size+1, GFP_KERNEL); if (!buf) return -ENOMEM; buf[size] = 0; file->private_data = buf; u32_format_array(buf, size, data->array, data->n_elements); return nonseekable_open(inode, file); } static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { size_t size = strlen(file->private_data); return simple_read_from_buffer(buf, len, ppos, file->private_data, size); } static int u32_array_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release = u32_array_release, .read = u32_array_read, }; /** * debugfs_create_u32_array - create a debugfs file that is used to read u32 * array. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @array: wrapper struct containing data pointer and size of the array. * * This function creates a file in debugfs with the given name that exports * @array as data. If the @mode variable is so set it can be read from. * Writing is not supported. Seek within the file is also not supported. * Once array is created its size can not be changed. */ void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, struct debugfs_u32_array *array) { debugfs_create_file_unsafe(name, mode, parent, array, &u32_array_fops); } EXPORT_SYMBOL_GPL(debugfs_create_u32_array); #ifdef CONFIG_HAS_IOMEM /* * The regset32 stuff is used to print 32-bit registers using the * seq_file utilities. We offer printing a register set in an already-opened * sequential file or create a debugfs file that only prints a regset32. */ /** * debugfs_print_regs32 - use seq_print to describe a set of registers * @s: the seq_file structure being used to generate output * @regs: an array if struct debugfs_reg32 structures * @nregs: the length of the above array * @base: the base address to be used in reading the registers * @prefix: a string to be prefixed to every output line * * This function outputs a text block describing the current values of * some 32-bit hardware registers. It is meant to be used within debugfs * files based on seq_file that need to show registers, intermixed with other * information. The prefix argument may be used to specify a leading string, * because some peripherals have several blocks of identical registers, * for example configuration of dma channels */ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, int nregs, void __iomem *base, char *prefix) { int i; for (i = 0; i < nregs; i++, regs++) { if (prefix) seq_printf(s, "%s", prefix); seq_printf(s, "%s = 0x%08x\n", regs->name, readl(base + regs->offset)); if (seq_has_overflowed(s)) break; } } EXPORT_SYMBOL_GPL(debugfs_print_regs32); static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; if (regset->dev) pm_runtime_get_sync(regset->dev); debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); if (regset->dev) pm_runtime_put(regset->dev); return 0; } DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @regset: a pointer to a struct debugfs_regset32, which contains a pointer * to an array of register definitions, the array size and the base * address where the register bank is to be found. * * This function creates a file in debugfs with the given name that reports * the names and values of a set of 32-bit registers. If the @mode variable * is so set it can be read from. Writing is not supported. */ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); #endif /* CONFIG_HAS_IOMEM */ struct debugfs_devm_entry { int (*read)(struct seq_file *seq, void *data); struct device *dev; }; static int debugfs_devm_entry_open(struct inode *inode, struct file *f) { struct debugfs_devm_entry *entry = inode->i_private; return single_open(f, entry->read, entry->dev); } static const struct file_operations debugfs_devm_entry_ops = { .owner = THIS_MODULE, .open = debugfs_devm_entry_open, .release = single_release, .read = seq_read, .llseek = seq_lseek }; /** * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. * * @dev: device related to this debugfs file. * @name: name of the debugfs file. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @read_fn: function pointer called to print the seq_file content. */ void debugfs_create_devm_seqfile(struct device *dev, const char *name, struct dentry *parent, int (*read_fn)(struct seq_file *s, void *data)) { struct debugfs_devm_entry *entry; if (IS_ERR(parent)) return; entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); if (!entry) return; entry->read = read_fn; entry->dev = dev; debugfs_create_file(name, S_IRUGO, parent, entry, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);
13 13 13 13 13 13 13 13 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 // SPDX-License-Identifier: GPL-2.0-or-later /* * RSA Signature Scheme with Appendix - PKCS #1 v1.5 (RFC 8017 sec 8.2) * * https://www.rfc-editor.org/rfc/rfc8017#section-8.2 * * Copyright (c) 2015 - 2024 Intel Corporation */ #include <linux/module.h> #include <linux/scatterlist.h> #include <crypto/akcipher.h> #include <crypto/algapi.h> #include <crypto/hash.h> #include <crypto/sig.h> #include <crypto/internal/akcipher.h> #include <crypto/internal/rsa.h> #include <crypto/internal/sig.h> /* * Full Hash Prefix for EMSA-PKCS1-v1_5 encoding method (RFC 9580 table 24) * * RSA keys are usually much larger than the hash of the message to be signed. * The hash is therefore prepended by the Full Hash Prefix and a 0xff padding. * The Full Hash Prefix is an ASN.1 SEQUENCE containing the hash algorithm OID. * * https://www.rfc-editor.org/rfc/rfc9580#table-24 */ static const u8 hash_prefix_none[] = { }; static const u8 hash_prefix_md5[] = { 0x30, 0x20, 0x30, 0x0c, 0x06, 0x08, /* SEQUENCE (SEQUENCE (OID */ 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05, /* <algorithm>, */ 0x05, 0x00, 0x04, 0x10 /* NULL), OCTET STRING <hash>) */ }; static const u8 hash_prefix_sha1[] = { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 }; static const u8 hash_prefix_rmd160[] = { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x24, 0x03, 0x02, 0x01, 0x05, 0x00, 0x04, 0x14 }; static const u8 hash_prefix_sha224[] = { 0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04, 0x05, 0x00, 0x04, 0x1c }; static const u8 hash_prefix_sha256[] = { 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0x05, 0x00, 0x04, 0x20 }; static const u8 hash_prefix_sha384[] = { 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, 0x05, 0x00, 0x04, 0x30 }; static const u8 hash_prefix_sha512[] = { 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, 0x05, 0x00, 0x04, 0x40 }; static const u8 hash_prefix_sha3_256[] = { 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x08, 0x05, 0x00, 0x04, 0x20 }; static const u8 hash_prefix_sha3_384[] = { 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x09, 0x05, 0x00, 0x04, 0x30 }; static const u8 hash_prefix_sha3_512[] = { 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0a, 0x05, 0x00, 0x04, 0x40 }; static const struct hash_prefix { const char *name; const u8 *data; size_t size; } hash_prefixes[] = { #define _(X) { #X, hash_prefix_##X, sizeof(hash_prefix_##X) } _(none), _(md5), _(sha1), _(rmd160), _(sha256), _(sha384), _(sha512), _(sha224), #undef _ #define _(X) { "sha3-" #X, hash_prefix_sha3_##X, sizeof(hash_prefix_sha3_##X) } _(256), _(384), _(512), #undef _ { NULL } }; static const struct hash_prefix *rsassa_pkcs1_find_hash_prefix(const char *name) { const struct hash_prefix *p; for (p = hash_prefixes; p->name; p++) if (strcmp(name, p->name) == 0) return p; return NULL; } static bool rsassa_pkcs1_invalid_hash_len(unsigned int len, const struct hash_prefix *p) { /* * Legacy protocols such as TLS 1.1 or earlier and IKE version 1 * do not prepend a Full Hash Prefix to the hash. In that case, * the size of the Full Hash Prefix is zero. */ if (p->data == hash_prefix_none) return false; /* * The final byte of the Full Hash Prefix encodes the hash length. * * This needs to be revisited should hash algorithms with more than * 1016 bits (127 bytes * 8) ever be added. The length would then * be encoded into more than one byte by ASN.1. */ static_assert(HASH_MAX_DIGESTSIZE <= 127); return len != p->data[p->size - 1]; } struct rsassa_pkcs1_ctx { struct crypto_akcipher *child; unsigned int key_size; }; struct rsassa_pkcs1_inst_ctx { struct crypto_akcipher_spawn spawn; const struct hash_prefix *hash_prefix; }; static int rsassa_pkcs1_sign(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct sig_instance *inst = sig_alg_instance(tfm); struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst); const struct hash_prefix *hash_prefix = ictx->hash_prefix; struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); unsigned int pad_len; unsigned int ps_end; unsigned int len; u8 *in_buf; int err; if (!ctx->key_size) return -EINVAL; if (dlen < ctx->key_size) return -EOVERFLOW; if (rsassa_pkcs1_invalid_hash_len(slen, hash_prefix)) return -EINVAL; if (slen + hash_prefix->size > ctx->key_size - 11) return -EOVERFLOW; pad_len = ctx->key_size - slen - hash_prefix->size - 1; /* RFC 8017 sec 8.2.1 step 1 - EMSA-PKCS1-v1_5 encoding generation */ in_buf = dst; memmove(in_buf + pad_len + hash_prefix->size, src, slen); memcpy(in_buf + pad_len, hash_prefix->data, hash_prefix->size); ps_end = pad_len - 1; in_buf[0] = 0x01; memset(in_buf + 1, 0xff, ps_end - 1); in_buf[ps_end] = 0x00; /* RFC 8017 sec 8.2.1 step 2 - RSA signature */ err = crypto_akcipher_sync_decrypt(ctx->child, in_buf, ctx->key_size - 1, in_buf, ctx->key_size); if (err < 0) return err; len = err; pad_len = ctx->key_size - len; /* Four billion to one */ if (unlikely(pad_len)) { memmove(dst + pad_len, dst, len); memset(dst, 0, pad_len); } return ctx->key_size; } static int rsassa_pkcs1_verify(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *digest, unsigned int dlen) { struct sig_instance *inst = sig_alg_instance(tfm); struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst); const struct hash_prefix *hash_prefix = ictx->hash_prefix; struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); unsigned int child_reqsize = crypto_akcipher_reqsize(ctx->child); struct akcipher_request *child_req __free(kfree_sensitive) = NULL; struct crypto_wait cwait; struct scatterlist sg; unsigned int dst_len; unsigned int pos; u8 *out_buf; int err; /* RFC 8017 sec 8.2.2 step 1 - length checking */ if (!ctx->key_size || slen != ctx->key_size || rsassa_pkcs1_invalid_hash_len(dlen, hash_prefix)) return -EINVAL; /* RFC 8017 sec 8.2.2 step 2 - RSA verification */ child_req = kmalloc(sizeof(*child_req) + child_reqsize + ctx->key_size, GFP_KERNEL); if (!child_req) return -ENOMEM; out_buf = (u8 *)(child_req + 1) + child_reqsize; memcpy(out_buf, src, slen); crypto_init_wait(&cwait); sg_init_one(&sg, out_buf, slen); akcipher_request_set_tfm(child_req, ctx->child); akcipher_request_set_crypt(child_req, &sg, &sg, slen, slen); akcipher_request_set_callback(child_req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &cwait); err = crypto_akcipher_encrypt(child_req); err = crypto_wait_req(err, &cwait); if (err) return err; /* RFC 8017 sec 8.2.2 step 3 - EMSA-PKCS1-v1_5 encoding verification */ dst_len = child_req->dst_len; if (dst_len < ctx->key_size - 1) return -EINVAL; if (dst_len == ctx->key_size) { if (out_buf[0] != 0x00) /* Encrypted value had no leading 0 byte */ return -EINVAL; dst_len--; out_buf++; } if (out_buf[0] != 0x01) return -EBADMSG; for (pos = 1; pos < dst_len; pos++) if (out_buf[pos] != 0xff) break; if (pos < 9 || pos == dst_len || out_buf[pos] != 0x00) return -EBADMSG; pos++; if (hash_prefix->size > dst_len - pos) return -EBADMSG; if (crypto_memneq(out_buf + pos, hash_prefix->data, hash_prefix->size)) return -EBADMSG; pos += hash_prefix->size; /* RFC 8017 sec 8.2.2 step 4 - comparison of digest with out_buf */ if (dlen != dst_len - pos) return -EKEYREJECTED; if (memcmp(digest, out_buf + pos, dlen) != 0) return -EKEYREJECTED; return 0; } static unsigned int rsassa_pkcs1_key_size(struct crypto_sig *tfm) { struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); return ctx->key_size * BITS_PER_BYTE; } static int rsassa_pkcs1_set_pub_key(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); return rsa_set_key(ctx->child, &ctx->key_size, RSA_PUB, key, keylen); } static int rsassa_pkcs1_set_priv_key(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); return rsa_set_key(ctx->child, &ctx->key_size, RSA_PRIV, key, keylen); } static int rsassa_pkcs1_init_tfm(struct crypto_sig *tfm) { struct sig_instance *inst = sig_alg_instance(tfm); struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst); struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); struct crypto_akcipher *child_tfm; child_tfm = crypto_spawn_akcipher(&ictx->spawn); if (IS_ERR(child_tfm)) return PTR_ERR(child_tfm); ctx->child = child_tfm; return 0; } static void rsassa_pkcs1_exit_tfm(struct crypto_sig *tfm) { struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); crypto_free_akcipher(ctx->child); } static void rsassa_pkcs1_free(struct sig_instance *inst) { struct rsassa_pkcs1_inst_ctx *ctx = sig_instance_ctx(inst); struct crypto_akcipher_spawn *spawn = &ctx->spawn; crypto_drop_akcipher(spawn); kfree(inst); } static int rsassa_pkcs1_create(struct crypto_template *tmpl, struct rtattr **tb) { struct rsassa_pkcs1_inst_ctx *ctx; struct akcipher_alg *rsa_alg; struct sig_instance *inst; const char *hash_name; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SIG, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = sig_instance_ctx(inst); err = crypto_grab_akcipher(&ctx->spawn, sig_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; rsa_alg = crypto_spawn_akcipher_alg(&ctx->spawn); if (strcmp(rsa_alg->base.cra_name, "rsa") != 0) { err = -EINVAL; goto err_free_inst; } hash_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(hash_name)) { err = PTR_ERR(hash_name); goto err_free_inst; } ctx->hash_prefix = rsassa_pkcs1_find_hash_prefix(hash_name); if (!ctx->hash_prefix) { err = -EINVAL; goto err_free_inst; } err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "pkcs1(%s,%s)", rsa_alg->base.cra_name, hash_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "pkcs1(%s,%s)", rsa_alg->base.cra_driver_name, hash_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = rsa_alg->base.cra_priority; inst->alg.base.cra_ctxsize = sizeof(struct rsassa_pkcs1_ctx); inst->alg.init = rsassa_pkcs1_init_tfm; inst->alg.exit = rsassa_pkcs1_exit_tfm; inst->alg.sign = rsassa_pkcs1_sign; inst->alg.verify = rsassa_pkcs1_verify; inst->alg.key_size = rsassa_pkcs1_key_size; inst->alg.set_pub_key = rsassa_pkcs1_set_pub_key; inst->alg.set_priv_key = rsassa_pkcs1_set_priv_key; inst->free = rsassa_pkcs1_free; err = sig_register_instance(tmpl, inst); if (err) { err_free_inst: rsassa_pkcs1_free(inst); } return err; } struct crypto_template rsassa_pkcs1_tmpl = { .name = "pkcs1", .create = rsassa_pkcs1_create, .module = THIS_MODULE, }; MODULE_ALIAS_CRYPTO("pkcs1");
4675 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC64_64_H #define _ASM_X86_ATOMIC64_64_H #include <linux/types.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> /* The 64-bit atomic type */ #define ATOMIC64_INIT(i) { (i) } static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __READ_ONCE((v)->counter); } static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __WRITE_ONCE(v->counter, i); } static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "addq %1, %0" : "=m" (v->counter) : "er" (i), "m" (v->counter) : "memory"); } static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "subq %1, %0" : "=m" (v->counter) : "er" (i), "m" (v->counter) : "memory"); } static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } #define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } #define arch_atomic64_dec arch_atomic64_dec static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } #define arch_atomic64_add_negative arch_atomic64_add_negative static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic64_add_return arch_atomic64_add_return #define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v) static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add #define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v) static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } #define arch_atomic64_xchg arch_atomic64_xchg static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "andq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); return val; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "orq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); return val; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "xorq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); return val; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor #endif /* _ASM_X86_ATOMIC64_64_H */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 // SPDX-License-Identifier: GPL-2.0-only #include <linux/crc-ccitt.h> #include <linux/export.h> #include <linux/module.h> #include <linux/types.h> /* * This mysterious table is just the CRC of each possible byte. It can be * computed using the standard bit-at-a-time methods. The polynomial can * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12. * Add the implicit x^16, and you have the standard CRC-CCITT. */ u16 const crc_ccitt_table[256] = { 0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf, 0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7, 0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e, 0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876, 0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd, 0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5, 0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c, 0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974, 0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb, 0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3, 0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a, 0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72, 0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9, 0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1, 0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738, 0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70, 0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7, 0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff, 0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036, 0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e, 0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5, 0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd, 0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134, 0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c, 0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3, 0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb, 0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232, 0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a, 0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1, 0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9, 0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330, 0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78 }; EXPORT_SYMBOL(crc_ccitt_table); /** * crc_ccitt - recompute the CRC (CRC-CCITT variant) for the data * buffer * @crc: previous CRC value * @buffer: data pointer * @len: number of bytes in the buffer */ u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len) { while (len--) crc = crc_ccitt_byte(crc, *buffer++); return crc; } EXPORT_SYMBOL(crc_ccitt); MODULE_DESCRIPTION("CRC-CCITT calculations"); MODULE_LICENSE("GPL");
57 57 29 29 4 24 1 27 15 5 5 4 1 21 21 1 5 14 4 15 13 8 13 7 2 3 13 13 3 10 10 10 59 132 1 5 181 17 166 34 4 30 28 28 28 28 28 28 28 28 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 fragment reassembly for connection tracking * * Copyright (C)2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * * Based on: net/ipv6/reassembly.c */ #define pr_fmt(fmt) "IPv6-nf: " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <net/ipv6_frag.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/kernel.h> #include <linux/module.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netns/generic.h> static const char nf_frags_cache_name[] = "nf-frags"; static unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) { return net_generic(net, nf_frag_pernet_id); } #ifdef CONFIG_SYSCTL static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "nf_conntrack_frag6_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; static int nf_ct_frag6_sysctl_register(struct net *net) { struct nft_ct_frag6_pernet *nf_frag; struct ctl_table *table; struct ctl_table_header *hdr; table = nf_ct_frag6_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; } nf_frag = nf_frag_pernet(net); table[0].data = &nf_frag->fqdir->timeout; table[1].data = &nf_frag->fqdir->low_thresh; table[1].extra2 = &nf_frag->fqdir->high_thresh; table[2].data = &nf_frag->fqdir->high_thresh; table[2].extra1 = &nf_frag->fqdir->low_thresh; hdr = register_net_sysctl_sz(net, "net/netfilter", table, ARRAY_SIZE(nf_ct_frag6_sysctl_table)); if (hdr == NULL) goto err_reg; nf_frag->nf_frag_frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); const struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else static int nf_ct_frag6_sysctl_register(struct net *net) { return 0; } static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { } #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, const struct ipv6hdr *hdr, int iif) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, .daddr = hdr->daddr, .user = user, .iif = iif, }; struct inet_frag_queue *q; if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL))) key.iif = 0; q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; return container_of(q, struct frag_queue, q); } static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff, int *refs) { unsigned int payload_len; struct net_device *dev; struct sk_buff *prev; int offset, end, err; u8 ecn; if (fq->q.flags & INET_FRAG_COMPLETE) { pr_debug("Already completed\n"); goto err; } payload_len = ntohs(ipv6_hdr(skb)->payload_len); offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (payload_len - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { pr_debug("offset is too large.\n"); return -EINVAL; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, csum_partial(nh, (u8 *)(fhdr + 1) - nh, 0)); } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) { pr_debug("already received last fragment\n"); goto err; } fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. * Required by the RFC. */ if (end & 0x7) { /* RFC2460 says always send parameter problem in * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); inet_frag_kill(&fq->q, refs); return -EPROTO; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) { pr_debug("last packet already reached.\n"); goto err; } fq->q.len = end; } } if (end == offset) goto err; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { pr_debug("queue: message is too short.\n"); goto err; } if (pskb_trim_rcsum(skb, end - offset)) { pr_debug("Can't trim\n"); goto err; } /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) { if (err == IPFRAG_DUP) { /* No error for duplicates, pretend they got queued. */ kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG); return -EINPROGRESS; } goto insert_error; } if (dev) fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; fq->q.flags |= INET_FRAG_FIRST_IN; } if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or * -EINPROGRESS must be returned. */ return err ? -EINPROGRESS : 0; } skb_dst_drop(skb); skb_orphan(skb); return -EINPROGRESS; insert_error: inet_frag_kill(&fq->q, refs); err: skb_dst_drop(skb); return -EINVAL; } /* * Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs) { void *reasm_data; int payload_len; u8 ecn; inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) goto err; reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto err; payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) { net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", payload_len); goto err; } /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0]; memmove(skb->head + sizeof(struct frag_hdr), skb->head, (skb->data - skb->head) - sizeof(struct frag_hdr)); skb->mac_header += sizeof(struct frag_hdr); skb->network_header += sizeof(struct frag_hdr); skb_reset_transport_header(skb); inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->ignore_df = 1; skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_partial(skb_network_header(skb), skb_network_header_len(skb), skb->csum); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 0; err: inet_frag_kill(&fq->q, refs); return -EINVAL; } /* * find the header just before Fragment Header. * * if success return 0 and set ... * (*prevhdrp): the value of "Next Header Field" in the header * just before Fragment Header. * (*prevhoff): the offset of "Next Header Field" in the header * just before Fragment Header. * (*fhoff) : the offset of Fragment Header. * * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c * */ static int find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) { u8 nexthdr = ipv6_hdr(skb)->nexthdr; const int netoff = skb_network_offset(skb); u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr); int start = netoff + sizeof(struct ipv6hdr); int len = skb->len - start; u8 prevhdr = NEXTHDR_IPV6; while (nexthdr != NEXTHDR_FRAGMENT) { struct ipv6_opt_hdr hdr; int hdrlen; if (!ipv6_ext_hdr(nexthdr)) { return -1; } if (nexthdr == NEXTHDR_NONE) { pr_debug("next header is none\n"); return -1; } if (len < (int)sizeof(struct ipv6_opt_hdr)) { pr_debug("too short\n"); return -1; } if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) BUG(); if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(&hdr); else hdrlen = ipv6_optlen(&hdr); prevhdr = nexthdr; prev_nhoff = start; nexthdr = hdr.nexthdr; len -= hdrlen; start += hdrlen; } if (len < 0) return -1; *prevhdrp = prevhdr; *prevhoff = prev_nhoff; *fhoff = start; return 0; } int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; u8 nexthdr = NEXTHDR_FRAGMENT; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; int refs = 0; u8 prevhdr; /* Jumbo payload inhibits frag. header */ if (ipv6_hdr(skb)->payload_len == 0) { pr_debug("payload len = 0\n"); return 0; } if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) return 0; /* Discard the first fragment if it does not include all headers * RFC 8200, Section 4.5 */ if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) { pr_debug("Drop incomplete fragment\n"); return 0; } if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; skb_set_transport_header(skb, fhoff); hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); rcu_read_lock(); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { rcu_read_unlock(); pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; } spin_lock_bh(&fq->q.lock); ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; } spin_unlock_bh(&fq->q.lock); rcu_read_unlock(); inet_frag_putn(&fq->q, refs); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); int res; res = fqdir_init(&nf_frag->fqdir, &nf_frags, net); if (res < 0) return res; nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = nf_ct_frag6_sysctl_register(net); if (res < 0) fqdir_exit(nf_frag->fqdir); return res; } static void nf_ct_net_pre_exit(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); fqdir_pre_exit(nf_frag->fqdir); } static void nf_ct_net_exit(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); nf_ct_frags6_sysctl_unregister(net); fqdir_exit(nf_frag->fqdir); } static struct pernet_operations nf_ct_net_ops = { .init = nf_ct_net_init, .pre_exit = nf_ct_net_pre_exit, .exit = nf_ct_net_exit, .id = &nf_frag_pernet_id, .size = sizeof(struct nft_ct_frag6_pernet), }; static const struct rhashtable_params nfct_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = ip6frag_key_hashfn, .obj_hashfn = ip6frag_obj_hashfn, .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; int nf_ct_frag6_init(void) { int ret = 0; nf_frags.constructor = ip6frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; nf_frags.rhash_params = nfct_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; ret = register_pernet_subsys(&nf_ct_net_ops); if (ret) inet_frags_fini(&nf_frags); out: return ret; } void nf_ct_frag6_cleanup(void) { unregister_pernet_subsys(&nf_ct_net_ops); inet_frags_fini(&nf_frags); }
574 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") ); TRACE_EVENT(mm_lru_activate, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); ), TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
33 29 33 7 3 4 3 3 7 3 3 4 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 // SPDX-License-Identifier: GPL-2.0 /* * Central processing for nfsd. * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched/signal.h> #include <linux/freezer.h> #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> #include <linux/siphash.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/nfslocalio.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" #include "vfs.h" #include "netns.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_SVC atomic_t nfsd_th_cnt = ATOMIC_INIT(0); static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static int nfsd_acl_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_acl_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); #endif static int nfsd_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); /* * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in * nfsctl.c. In particular: * * user_recovery_dirname * user_lease_time * nfsd_versions */ DEFINE_MUTEX(nfsd_mutex); #if IS_ENABLED(CONFIG_NFS_LOCALIO) static const struct svc_version *localio_versions[] = { [1] = &localio_version1, }; #define NFSD_LOCALIO_NRVERS ARRAY_SIZE(localio_versions) #endif /* CONFIG_NFS_LOCALIO */ #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, # endif # if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, # endif }; #define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[NFSD_MAXVERS+1] = { #if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, #endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, #endif }; struct svc_program nfsd_programs[] = { { .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_MAXVERS+1, /* nr of entries in nfsd_version */ .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ .pg_authenticate = svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, }, #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }, #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ #if IS_ENABLED(CONFIG_NFS_LOCALIO) { .pg_prog = NFS_LOCALIO_PROGRAM, .pg_nvers = NFSD_LOCALIO_NRVERS, .pg_vers = localio_versions, .pg_name = "nfslocalio", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, } #endif /* CONFIG_NFS_LOCALIO */ }; bool nfsd_support_version(int vers) { if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS) return nfsd_version[vers] != NULL; return false; } int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers > NFSD_MAXVERS) return 0; switch(change) { case NFSD_SET: nn->nfsd_versions[vers] = nfsd_support_version(vers); break; case NFSD_CLEAR: nn->nfsd_versions[vers] = false; break; case NFSD_TEST: return nn->nfsd_versions[vers]; case NFSD_AVAIL: return nfsd_support_version(vers); } return 0; } static void nfsd_adjust_nfsd_versions4(struct nfsd_net *nn) { unsigned i; for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { if (nn->nfsd4_minorversions[i]) return; } nfsd_vers(nn, 4, NFSD_CLEAR); } int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && change != NFSD_AVAIL) return -1; switch(change) { case NFSD_SET: nfsd_vers(nn, 4, NFSD_SET); nn->nfsd4_minorversions[minorversion] = nfsd_vers(nn, 4, NFSD_TEST); break; case NFSD_CLEAR: nn->nfsd4_minorversions[minorversion] = false; nfsd_adjust_nfsd_versions4(nn); break; case NFSD_TEST: return nn->nfsd4_minorversions[minorversion]; case NFSD_AVAIL: return minorversion <= NFSD_SUPPORTED_MINOR_VERSION && nfsd_vers(nn, 4, NFSD_AVAIL); } return 0; } bool nfsd_net_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return (nn && percpu_ref_tryget_live(&nn->nfsd_net_ref)); } void nfsd_net_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); percpu_ref_put(&nn->nfsd_net_ref); } static void nfsd_net_done(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); complete(&nn->nfsd_net_confirm_done); } static void nfsd_net_free(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); complete(&nn->nfsd_net_free_done); } /* * Maximum number of nfsd processes */ #define NFSD_MAXSERVS 8192 int nfsd_nrthreads(struct net *net) { int rv = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) rv = nn->nfsd_serv->sv_nrthreads; mutex_unlock(&nfsd_mutex); return rv; } static int nfsd_users = 0; static int nfsd_startup_generic(void) { int ret; if (nfsd_users++) return 0; ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) goto out_file_cache; return 0; out_file_cache: nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; } static void nfsd_shutdown_generic(void) { if (--nfsd_users) return; nfs4_state_shutdown(); nfsd_file_cache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) { return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } /** * nfsd_copy_write_verifier - Atomically copy a write verifier * @verf: buffer in which to receive the verifier cookie * @nn: NFS net namespace * * This function provides a wait-free mechanism for copying the * namespace's write verifier without tearing it. */ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { unsigned int seq; do { seq = read_seqbegin(&nn->writeverf_lock); memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); } while (read_seqretry(&nn->writeverf_lock, seq)); } static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { struct timespec64 now; u64 verf; /* * Because the time value is hashed, y2038 time_t overflow * is irrelevant in this usage. */ ktime_get_raw_ts64(&now); verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } /** * nfsd_reset_write_verifier - Generate a new write verifier * @nn: NFS net namespace * * This function updates the ->writeverf field of @nn. This field * contains an opaque cookie that, according to Section 18.32.3 of * RFC 8881, "the client can use to determine whether a server has * changed instance state (e.g., server restart) between a call to * WRITE and a subsequent call to either WRITE or COMMIT. This * cookie MUST be unchanged during a single instance of the NFSv4.1 * server and MUST be unique between instances of the NFSv4.1 * server." */ void nfsd_reset_write_verifier(struct nfsd_net *nn) { write_seqlock(&nn->writeverf_lock); nfsd_reset_write_verifier_locked(nn); write_sequnlock(&nn->writeverf_lock); } /* * Crank up a set of per-namespace resources for a new NFSD instance, * including lockd, a duplicate reply cache, an open file cache * instance, and a cache of NFSv4 state objects. */ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; if (nn->nfsd_net_up) return 0; ret = nfsd_startup_generic(); if (ret) return ret; if (list_empty(&nn->nfsd_serv->sv_permsocks)) { pr_warn("NFSD: Failed to start, no listeners configured.\n"); ret = -EIO; goto out_socks; } if (nfsd_needs_lockd(nn) && !nn->lockd_up) { ret = lockd_up(net, cred); if (ret) goto out_socks; nn->lockd_up = true; } ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; ret = nfsd_reply_cache_init(nn); if (ret) goto out_filecache; #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_init_umount_work(nn); #endif ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; nn->nfsd_net_up = true; return 0; out_reply_cache: nfsd_reply_cache_shutdown(nn); out_filecache: nfsd_file_cache_shutdown_net(net); out_lockd: if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } out_socks: nfsd_shutdown_generic(); return ret; } static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->nfsd_net_up) { percpu_ref_kill_and_confirm(&nn->nfsd_net_ref, nfsd_net_done); wait_for_completion(&nn->nfsd_net_confirm_done); nfsd_export_flush(net); nfs4_state_shutdown_net(net); nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } wait_for_completion(&nn->nfsd_net_free_done); } percpu_ref_exit(&nn->nfsd_net_ref); if (nn->nfsd_net_up) nfsd_shutdown_generic(); nn->nfsd_net_up = false; } static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inetaddr_notifier = { .notifier_call = nfsd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nfsd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct net_device *dev = ifa->idev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inet6addr_notifier = { .notifier_call = nfsd_inet6addr_event, }; #endif /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); /** * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace * @net: network namespace the NFS service is associated with */ void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; lockdep_assert_held(&nfsd_mutex); spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = NULL; spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } /* * write_ports can create the server without actually starting * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } void nfsd_reset_versions(struct nfsd_net *nn) { int i; for (i = 0; i <= NFSD_MAXVERS; i++) if (nfsd_vers(nn, i, NFSD_TEST)) return; for (i = 0; i <= NFSD_MAXVERS; i++) if (i != 4) nfsd_vers(nn, i, NFSD_SET); else { int minor = 0; while (nfsd_minorversion(nn, minor, NFSD_SET) >= 0) minor++; } } static int nfsd_get_default_max_blksize(void) { struct sysinfo i; unsigned long long target; unsigned long ret; si_meminfo(&i); target = (i.totalram - i.totalhigh) << PAGE_SHIFT; /* * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig * machines, but only uses 32K on 128M machines. Bottom out at * 8K on 32M and smaller. Of course, this is only a default. */ target >>= 12; ret = NFSSVC_DEFBLKSIZE; while (ret > target && ret >= 8*1024*2) ret /= 2; return ret; } void nfsd_shutdown_threads(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; mutex_lock(&nfsd_mutex); serv = nn->nfsd_serv; if (serv == NULL) { mutex_unlock(&nfsd_mutex); return; } /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); nfsd_destroy_serv(net); mutex_unlock(&nfsd_mutex); } struct svc_rqst *nfsd_current_rqst(void) { if (kthread_func(current) == nfsd) return kthread_data(current); return NULL; } int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nn->nfsd_serv) return 0; error = percpu_ref_init(&nn->nfsd_net_ref, nfsd_net_free, 0, GFP_KERNEL); if (error) return error; init_completion(&nn->nfsd_net_free_done); init_completion(&nn->nfsd_net_confirm_done); if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); serv = svc_create_pooled(nfsd_programs, ARRAY_SIZE(nfsd_programs), &nn->nfsd_svcstats, nfsd_max_blksize, nfsd); if (serv == NULL) { percpu_ref_exit(&nn->nfsd_net_ref); return -ENOMEM; } error = svc_bind(serv, net); if (error < 0) { svc_destroy(&serv); percpu_ref_exit(&nn->nfsd_net_ref); return error; } spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); /* check if the notifier is already set */ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } nfsd_reset_write_verifier(nn); return 0; } int nfsd_nrpools(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->nfsd_serv == NULL) return 0; else return nn->nfsd_serv->sv_nrpools; } int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; int i; if (serv) for (i = 0; i < serv->sv_nrpools && i < n; i++) nthreads[i] = serv->sv_pools[i].sp_nrthreads; return 0; } /** * nfsd_set_nrthreads - set the number of running threads in the net's service * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * * This function alters the number of running threads for the given network * namespace in each pool. If passed an array longer then the number of pools * the extra pool settings are ignored. If passed an array shorter than the * number of pools, the missing values are interpreted as 0's. * * Returns 0 on success or a negative errno on error. */ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; int tot = 0; int err = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); lockdep_assert_held(&nfsd_mutex); if (nn->nfsd_serv == NULL || n <= 0) return 0; /* * Special case: When n == 1, pass in NULL for the pool, so that the * change is distributed equally among them. */ if (n == 1) return svc_set_num_threads(nn->nfsd_serv, NULL, nthreads[0]); if (n > nn->nfsd_serv->sv_nrpools) n = nn->nfsd_serv->sv_nrpools; /* enforce a global maximum number of threads */ tot = 0; for (i = 0; i < n; i++) { nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); tot += nthreads[i]; } if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } for (i = 0; i < n && tot > 0; i++) { nthreads[i]--; tot--; } } /* apply the new numbers */ for (i = 0; i < n; i++) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], nthreads[i]); if (err) goto out; } /* Anything undefined in array is considered to be 0 */ for (i = n; i < nn->nfsd_serv->sv_nrpools; ++i) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], 0); if (err) goto out; } out: return err; } /** * nfsd_svc: start up or shut down the nfsd server * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * @cred: credentials to use for xprt creation * @scope: server scope value (defaults to nodename) * * Adjust the number of threads in each pool and return the new * total number of threads in the service. */ int nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const char *scope) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; lockdep_assert_held(&nfsd_mutex); dprintk("nfsd: creating service\n"); strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename, sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); if (error) goto out; serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; error = nfsd_set_nrthreads(n, nthreads, net); if (error) goto out_put; error = serv->sv_nrthreads; out_put: if (serv->sv_nrthreads == 0) nfsd_destroy_serv(net); out: return error; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static bool nfsd_support_acl_version(int vers) { if (vers >= NFSD_ACL_MINVERS && vers < NFSD_ACL_NRVERS) return nfsd_acl_version[vers] != NULL; return false; } static int nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_support_acl_version(version) || !nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_acl_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_ACL_NRVERS; for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers == NFSD_ACL_NRVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_ACL_MINVERS; for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } #endif static int nfsd_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_MAXVERS + 1; for (i = NFSD_MINVERS; i <= NFSD_MAXVERS; i++) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers > NFSD_MAXVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_MINVERS; for (i = NFSD_MAXVERS; i >= NFSD_MINVERS; i--) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } /* * This is the NFS server kernel thread */ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); /* At this point, the thread shares current->fs * with the init process. We need to create files with the * umask as defined by the client instead of init's umask. */ svc_thread_init_status(rqstp, unshare_fs_struct()); current->fs->umask = 0; atomic_inc(&nfsd_th_cnt); set_freezable(); /* * The main request loop */ while (!svc_thread_should_stop(rqstp)) { svc_recv(rqstp); nfsd_file_net_dispose(nn); } atomic_dec(&nfsd_th_cnt); /* Release the thread */ svc_exit_thread(rqstp); return 0; } /** * nfsd_dispatch - Process an NFS or NFSACL or LOCALIO Request * @rqstp: incoming request * * This RPC dispatcher integrates the NFS server's duplicate reply cache. * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; unsigned int start, len; __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; /* * ->pc_decode advances the argument stream past the NFS * Call header, so grab the header's starting location and * size now for the call to nfsd_cache_lookup(). */ start = xdr_stream_pos(&rqstp->rq_arg_stream); len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; /* * Release rq_status_counter setting it to an odd value after the rpc * request has been properly parsed. rq_status_counter is used to * notify the consumers if the rqstp fields are stable * (rq_status_counter is odd) or not meaningful (rq_status_counter * is even). */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); rp = NULL; switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: goto out_cached_reply; case RC_DROPIT: goto out_dropit; } nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; /* * Release rq_status_counter setting it to an even value after the rpc * request has been properly processed. */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; out_decode_err: trace_nfsd_garbage_args_err(rqstp); *statp = rpc_garbage_args; return 1; out_update_drop: nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } /** * nfssvc_decode_voidarg - Decode void arguments * @rqstp: Server RPC transaction context * @xdr: XDR stream positioned at arguments to decode * * Return values: * %false: Arguments were not valid * %true: Decoding was successful */ bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; } /** * nfssvc_encode_voidres - Encode void results * @rqstp: Server RPC transaction context * @xdr: XDR stream into which to encode results * * Return values: * %false: Local error while encoding * %true: Encoding was successful */ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; }
13 13 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 // SPDX-License-Identifier: GPL-2.0 /* * devtmpfs - kernel-maintained tmpfs-based /dev * * Copyright (C) 2009, Kay Sievers <kay.sievers@vrfy.org> * * During bootup, before any driver core device is registered, * devtmpfs, a tmpfs-based filesystem is created. Every driver-core * device which requests a device node, will add a node in this * filesystem. * By default, all devices are named after the name of the device, * owned by root and have a default mode of 0600. Subsystems can * overwrite the default setting if needed. */ #define pr_fmt(fmt) "devtmpfs: " fmt #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/device.h> #include <linux/blkdev.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/shmem_fs.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/init_syscalls.h> #include <uapi/linux/mount.h> #include "base.h" #ifdef CONFIG_DEVTMPFS_SAFE #define DEVTMPFS_MFLAGS (MS_SILENT | MS_NOEXEC | MS_NOSUID) #else #define DEVTMPFS_MFLAGS (MS_SILENT) #endif static struct task_struct *thread; static int __initdata mount_dev = IS_ENABLED(CONFIG_DEVTMPFS_MOUNT); static DEFINE_SPINLOCK(req_lock); static struct req { struct req *next; struct completion done; int err; const char *name; umode_t mode; /* 0 => delete */ kuid_t uid; kgid_t gid; struct device *dev; } *requests; static int __init mount_param(char *str) { mount_dev = simple_strtoul(str, NULL, 0); return 1; } __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS .init_fs_context = shmem_init_fs_context, #else .init_fs_context = ramfs_init_fs_context, #endif .kill_sb = kill_anon_super, }; /* Simply take a ref on the existing mount */ static int devtmpfs_get_tree(struct fs_context *fc) { struct super_block *sb = mnt->mnt_sb; atomic_inc(&sb->s_active); down_write(&sb->s_umount); fc->root = dget(sb->s_root); return 0; } /* Ops are filled in during init depending on underlying shmem or ramfs type */ struct fs_context_operations devtmpfs_context_ops = {}; /* Call the underlying initialization and set to our ops */ static int devtmpfs_init_fs_context(struct fs_context *fc) { int ret; #ifdef CONFIG_TMPFS ret = shmem_init_fs_context(fc); #else ret = ramfs_init_fs_context(fc); #endif if (ret < 0) return ret; fc->ops = &devtmpfs_context_ops; return 0; } static struct file_system_type dev_fs_type = { .name = "devtmpfs", .init_fs_context = devtmpfs_init_fs_context, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) { init_completion(&req->done); spin_lock(&req_lock); req->next = requests; requests = req; spin_unlock(&req_lock); wake_up_process(thread); wait_for_completion(&req->done); kfree(tmp); return req->err; } int devtmpfs_create_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.mode = 0; req.uid = GLOBAL_ROOT_UID; req.gid = GLOBAL_ROOT_GID; req.name = device_get_devnode(dev, &req.mode, &req.uid, &req.gid, &tmp); if (!req.name) return -ENOMEM; if (req.mode == 0) req.mode = 0600; if (is_blockdev(dev)) req.mode |= S_IFBLK; else req.mode |= S_IFCHR; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } int devtmpfs_delete_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp); if (!req.name) return -ENOMEM; req.mode = 0; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } static int dev_mkdir(const char *name, umode_t mode) { struct dentry *dentry; struct path path; dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, NULL); if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } static int create_path(const char *nodepath) { char *path; char *s; int err = 0; /* parent directories do not exist, create them */ path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; s = path; for (;;) { s = strchr(s, '/'); if (!s) break; s[0] = '\0'; err = dev_mkdir(path, 0755); if (err && err != -EEXIST) break; s[0] = '/'; s++; } kfree(path); return err; } static int handle_create(const char *nodename, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { struct dentry *dentry; struct path path; int err; dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt, NULL); if (!err) { struct iattr newattrs; newattrs.ia_mode = mode; newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } end_creating_path(&path, dentry); return err; } static int dev_rmdir(const char *name) { struct path parent; struct dentry *dentry; int err; dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); else err = -EPERM; end_removing_path(&parent, dentry); return err; } static int delete_path(const char *nodepath) { char *path; int err = 0; path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; for (;;) { char *base; base = strrchr(path, '/'); if (!base) break; base[0] = '\0'; err = dev_rmdir(path); if (err) break; } kfree(path); return err; } static int dev_mynode(struct device *dev, struct inode *inode) { /* did we create it */ if (inode->i_private != &thread) return 0; /* does the dev_t match */ if (is_blockdev(dev)) { if (!S_ISBLK(inode->i_mode)) return 0; } else { if (!S_ISCHR(inode->i_mode)) return 0; } if (inode->i_rdev != dev->devt) return 0; /* ours */ return 1; } static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; struct inode *inode; int deleted = 0; int err = 0; dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = d_inode(dentry); if (dev_mynode(dev, inode)) { struct iattr newattrs; /* * before unlinking this node, reset permissions * of possible references like hardlinks */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; newattrs.ia_mode = inode->i_mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } end_removing_path(&parent, dentry); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; } /* * If configured, or requested by the commandline, devtmpfs will be * auto-mounted after the kernel mounted the root filesystem. */ int __init devtmpfs_mount(void) { int err; if (!mount_dev) return 0; if (!thread) return 0; err = init_mount("devtmpfs", "dev", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) pr_info("error mounting %d\n", err); else pr_info("mounted\n"); return err; } static __initdata DECLARE_COMPLETION(setup_done); static int handle(const char *name, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { if (mode) return handle_create(name, mode, uid, gid, dev); else return handle_remove(name, dev); } static void __noreturn devtmpfs_work_loop(void) { while (1) { spin_lock(&req_lock); while (requests) { struct req *req = requests; requests = NULL; spin_unlock(&req_lock); while (req) { struct req *next = req->next; req->err = handle(req->name, req->mode, req->uid, req->gid, req->dev); complete(&req->done); req = next; } spin_lock(&req_lock); } __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&req_lock); schedule(); } } static noinline int __init devtmpfs_setup(void *p) { int err; err = ksys_unshare(CLONE_NEWNS); if (err) goto out; err = init_mount("devtmpfs", "/", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) goto out; init_chdir("/.."); /* will traverse into overmounted root */ init_chroot("."); out: *(int *)p = err; return err; } /* * The __ref is because devtmpfs_setup needs to be __init for the routines it * calls. That call is done while devtmpfs_init, which is marked __init, * synchronously waits for it to complete. */ static int __ref devtmpfsd(void *p) { int err = devtmpfs_setup(p); complete(&setup_done); if (err) return err; devtmpfs_work_loop(); return 0; } /* * Get the underlying (shmem/ramfs) context ops to build ours */ static int devtmpfs_configure_context(void) { struct fs_context *fc; fc = fs_context_for_reconfigure(mnt->mnt_root, mnt->mnt_sb->s_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* Set up devtmpfs_context_ops based on underlying type */ devtmpfs_context_ops.free = fc->ops->free; devtmpfs_context_ops.dup = fc->ops->dup; devtmpfs_context_ops.parse_param = fc->ops->parse_param; devtmpfs_context_ops.parse_monolithic = fc->ops->parse_monolithic; devtmpfs_context_ops.get_tree = &devtmpfs_get_tree; devtmpfs_context_ops.reconfigure = fc->ops->reconfigure; put_fs_context(fc); return 0; } /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. */ int __init devtmpfs_init(void) { char opts[] = "mode=0755"; int err; mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts); if (IS_ERR(mnt)) { pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } err = devtmpfs_configure_context(); if (err) { pr_err("unable to configure devtmpfs type %d\n", err); return err; } err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); return err; } thread = kthread_run(devtmpfsd, &err, "kdevtmpfs"); if (!IS_ERR(thread)) { wait_for_completion(&setup_done); } else { err = PTR_ERR(thread); thread = NULL; } if (err) { pr_err("unable to create devtmpfs %d\n", err); unregister_filesystem(&dev_fs_type); thread = NULL; return err; } pr_info("initialized\n"); return 0; }
2 2 2 5 5 5 5 5 5 2 2 1 4 5 13 2 5 4 2 8 10 1 8 1 3 2 3 12 1 1 11 7 7 7 4 1 3 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 // SPDX-License-Identifier: GPL-2.0-only /* * File: pep.c * * Phonet pipe protocol end point socket * * Copyright (C) 2008 Nokia Corporation. * * Author: Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/socket.h> #include <net/sock.h> #include <net/tcp_states.h> #include <asm/ioctls.h> #include <linux/phonet.h> #include <linux/module.h> #include <net/phonet/phonet.h> #include <net/phonet/pep.h> #include <net/phonet/gprs.h> /* sk_state values: * TCP_CLOSE sock not in use yet * TCP_CLOSE_WAIT disconnected pipe * TCP_LISTEN listening pipe endpoint * TCP_SYN_RECV connected pipe in disabled state * TCP_ESTABLISHED connected pipe in enabled state * * pep_sock locking: * - sk_state, hlist: sock lock needed * - listener: read only * - pipe_handle: read only */ #define CREDITS_MAX 10 #define CREDITS_THR 7 #define pep_sb_size(s) (((s) + 5) & ~3) /* 2-bytes head, 32-bits aligned */ /* Get the next TLV sub-block. */ static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen, void *buf) { void *data = NULL; struct { u8 sb_type; u8 sb_len; } *ph, h; int buflen = *plen; ph = skb_header_pointer(skb, 0, 2, &h); if (ph == NULL || ph->sb_len < 2 || !pskb_may_pull(skb, ph->sb_len)) return NULL; ph->sb_len -= 2; *ptype = ph->sb_type; *plen = ph->sb_len; if (buflen > ph->sb_len) buflen = ph->sb_len; data = skb_header_pointer(skb, 2, buflen, buf); __skb_pull(skb, 2 + ph->sb_len); return data; } static struct sk_buff *pep_alloc_skb(struct sock *sk, const void *payload, int len, gfp_t priority) { struct sk_buff *skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority); if (!skb) return NULL; skb_set_owner_w(skb, sk); skb_reserve(skb, MAX_PNPIPE_HEADER); __skb_put(skb, len); skb_copy_to_linear_data(skb, payload, len); __skb_push(skb, sizeof(struct pnpipehdr)); skb_reset_transport_header(skb); return skb; } static int pep_reply(struct sock *sk, struct sk_buff *oskb, u8 code, const void *data, int len, gfp_t priority) { const struct pnpipehdr *oph = pnp_hdr(oskb); struct pnpipehdr *ph; struct sk_buff *skb; struct sockaddr_pn peer; skb = pep_alloc_skb(sk, data, len, priority); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = oph->utid; ph->message_id = oph->message_id + 1; /* REQ -> RESP */ ph->pipe_handle = oph->pipe_handle; ph->error_code = code; pn_skb_get_src_sockaddr(oskb, &peer); return pn_skb_send(sk, skb, &peer); } static int pep_indicate(struct sock *sk, u8 id, u8 code, const void *data, int len, gfp_t priority) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; skb = pep_alloc_skb(sk, data, len, priority); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = 0; ph->message_id = id; ph->pipe_handle = pn->pipe_handle; ph->error_code = code; return pn_skb_send(sk, skb, NULL); } #define PAD 0x00 static int pipe_handler_request(struct sock *sk, u8 id, u8 code, const void *data, int len) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; skb = pep_alloc_skb(sk, data, len, GFP_KERNEL); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = id; /* whatever */ ph->message_id = id; ph->pipe_handle = pn->pipe_handle; ph->error_code = code; return pn_skb_send(sk, skb, NULL); } static int pipe_handler_send_created_ind(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); u8 data[4] = { PN_PIPE_SB_NEGOTIATED_FC, pep_sb_size(2), pn->tx_fc, pn->rx_fc, }; return pep_indicate(sk, PNS_PIPE_CREATED_IND, 1 /* sub-blocks */, data, 4, GFP_ATOMIC); } static int pep_accept_conn(struct sock *sk, struct sk_buff *skb) { static const u8 data[20] = { PAD, PAD, PAD, 2 /* sub-blocks */, PN_PIPE_SB_REQUIRED_FC_TX, pep_sb_size(5), 3, PAD, PN_MULTI_CREDIT_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PAD, PN_PIPE_SB_PREFERRED_FC_RX, pep_sb_size(5), 3, PAD, PN_MULTI_CREDIT_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PAD, }; might_sleep(); return pep_reply(sk, skb, PN_PIPE_NO_ERROR, data, sizeof(data), GFP_KERNEL); } static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code, gfp_t priority) { static const u8 data[4] = { PAD, PAD, PAD, 0 /* sub-blocks */ }; WARN_ON(code == PN_PIPE_NO_ERROR); return pep_reply(sk, skb, code, data, sizeof(data), priority); } /* Control requests are not sent by the pipe service and have a specific * message format. */ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code, gfp_t priority) { const struct pnpipehdr *oph = pnp_hdr(oskb); struct sk_buff *skb; struct pnpipehdr *ph; struct sockaddr_pn dst; u8 data[4] = { oph->pep_type, /* PEP type */ code, /* error code, at an unusual offset */ PAD, PAD, }; skb = pep_alloc_skb(sk, data, 4, priority); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = oph->utid; ph->message_id = PNS_PEP_CTRL_RESP; ph->pipe_handle = oph->pipe_handle; ph->data0 = oph->data[0]; /* CTRL id */ pn_skb_get_src_sockaddr(oskb, &dst); return pn_skb_send(sk, skb, &dst); } static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority) { u8 data[4] = { type, PAD, PAD, status }; return pep_indicate(sk, PNS_PEP_STATUS_IND, PN_PEP_TYPE_COMMON, data, 4, priority); } /* Send our RX flow control information to the sender. * Socket must be locked. */ static void pipe_grant_credits(struct sock *sk, gfp_t priority) { struct pep_sock *pn = pep_sk(sk); BUG_ON(sk->sk_state != TCP_ESTABLISHED); switch (pn->rx_fc) { case PN_LEGACY_FLOW_CONTROL: /* TODO */ break; case PN_ONE_CREDIT_FLOW_CONTROL: if (pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL, PEP_IND_READY, priority) == 0) pn->rx_credits = 1; break; case PN_MULTI_CREDIT_FLOW_CONTROL: if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX) break; if (pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, CREDITS_MAX - pn->rx_credits, priority) == 0) pn->rx_credits = CREDITS_MAX; break; } } static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr; int wake = 0; if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) return -EINVAL; hdr = pnp_hdr(skb); if (hdr->pep_type != PN_PEP_TYPE_COMMON) { net_dbg_ratelimited("Phonet unknown PEP type: %u\n", (unsigned int)hdr->pep_type); return -EOPNOTSUPP; } switch (hdr->data[0]) { case PN_PEP_IND_FLOW_CONTROL: switch (pn->tx_fc) { case PN_LEGACY_FLOW_CONTROL: switch (hdr->data[3]) { case PEP_IND_BUSY: atomic_set(&pn->tx_credits, 0); break; case PEP_IND_READY: atomic_set(&pn->tx_credits, wake = 1); break; } break; case PN_ONE_CREDIT_FLOW_CONTROL: if (hdr->data[3] == PEP_IND_READY) atomic_set(&pn->tx_credits, wake = 1); break; } break; case PN_PEP_IND_ID_MCFC_GRANT_CREDITS: if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL) break; atomic_add(wake = hdr->data[3], &pn->tx_credits); break; default: net_dbg_ratelimited("Phonet unknown PEP indication: %u\n", (unsigned int)hdr->data[0]); return -EOPNOTSUPP; } if (wake) sk->sk_write_space(sk); return 0; } static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); u8 n_sb = hdr->data0; pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL; __skb_pull(skb, sizeof(*hdr)); while (n_sb > 0) { u8 type, buf[2], len = sizeof(buf); u8 *data = pep_get_sb(skb, &type, &len, buf); if (data == NULL) return -EINVAL; switch (type) { case PN_PIPE_SB_NEGOTIATED_FC: if (len < 2 || (data[0] | data[1]) > 3) break; pn->tx_fc = data[0] & 3; pn->rx_fc = data[1] & 3; break; } n_sb--; } return 0; } /* Queue an skb to a connected sock. * Socket lock must be held. */ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); struct sk_buff_head *queue; int err = 0; BUG_ON(sk->sk_state == TCP_CLOSE_WAIT); switch (hdr->message_id) { case PNS_PEP_CONNECT_REQ: pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC); break; case PNS_PEP_DISCONNECT_REQ: pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); sk->sk_state = TCP_CLOSE_WAIT; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); break; case PNS_PEP_ENABLE_REQ: /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */ pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; case PNS_PEP_RESET_REQ: switch (hdr->state_after_reset) { case PN_PIPE_DISABLE: pn->init_enable = 0; break; case PN_PIPE_ENABLE: pn->init_enable = 1; break; default: /* not allowed to send an error here!? */ err = -EINVAL; goto out; } fallthrough; case PNS_PEP_DISABLE_REQ: atomic_set(&pn->tx_credits, 0); pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { sk_drops_inc(sk); break; } __skb_pull(skb, 4); queue = &pn->ctrlreq_queue; goto queue; case PNS_PIPE_ALIGNED_DATA: __skb_pull(skb, 1); fallthrough; case PNS_PIPE_DATA: __skb_pull(skb, 3); /* Pipe data header */ if (!pn_flow_safe(pn->rx_fc)) { err = sock_queue_rcv_skb(sk, skb); if (!err) return NET_RX_SUCCESS; err = -ENOBUFS; break; } if (pn->rx_credits == 0) { sk_drops_inc(sk); err = -ENOBUFS; break; } pn->rx_credits--; queue = &sk->sk_receive_queue; goto queue; case PNS_PEP_STATUS_IND: pipe_rcv_status(sk, skb); break; case PNS_PIPE_REDIRECTED_IND: err = pipe_rcv_created(sk, skb); break; case PNS_PIPE_CREATED_IND: err = pipe_rcv_created(sk, skb); if (err) break; fallthrough; case PNS_PIPE_RESET_IND: if (!pn->init_enable) break; fallthrough; case PNS_PIPE_ENABLED_IND: if (!pn_flow_safe(pn->tx_fc)) { atomic_set(&pn->tx_credits, 1); sk->sk_write_space(sk); } if (sk->sk_state == TCP_ESTABLISHED) break; /* Nothing to do */ sk->sk_state = TCP_ESTABLISHED; pipe_grant_credits(sk, GFP_ATOMIC); break; case PNS_PIPE_DISABLED_IND: sk->sk_state = TCP_SYN_RECV; pn->rx_credits = 0; break; default: net_dbg_ratelimited("Phonet unknown PEP message: %u\n", hdr->message_id); err = -EINVAL; } out: kfree_skb(skb); return (err == -ENOBUFS) ? NET_RX_DROP : NET_RX_SUCCESS; queue: skb->dev = NULL; skb_set_owner_r(skb, sk); skb_queue_tail(queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return NET_RX_SUCCESS; } /* Destroy connected sock. */ static void pipe_destruct(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&pn->ctrlreq_queue); } static u8 pipe_negotiate_fc(const u8 *fcs, unsigned int n) { unsigned int i; u8 final_fc = PN_NO_FLOW_CONTROL; for (i = 0; i < n; i++) { u8 fc = fcs[i]; if (fc > final_fc && fc < PN_MAX_FLOW_CONTROL) final_fc = fc; } return final_fc; } static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr; u8 n_sb; if (!pskb_pull(skb, sizeof(*hdr) + 4)) return -EINVAL; hdr = pnp_hdr(skb); if (hdr->error_code != PN_PIPE_NO_ERROR) return -ECONNREFUSED; /* Parse sub-blocks */ n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[6], len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); if (data == NULL) return -EINVAL; switch (type) { case PN_PIPE_SB_REQUIRED_FC_TX: if (len < 2 || len < data[0]) break; pn->tx_fc = pipe_negotiate_fc(data + 2, len - 2); break; case PN_PIPE_SB_PREFERRED_FC_RX: if (len < 2 || len < data[0]) break; pn->rx_fc = pipe_negotiate_fc(data + 2, len - 2); break; } n_sb--; } return pipe_handler_send_created_ind(sk); } static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb) { struct pnpipehdr *hdr = pnp_hdr(skb); if (hdr->error_code != PN_PIPE_NO_ERROR) return -ECONNREFUSED; return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */, NULL, 0, GFP_ATOMIC); } static void pipe_start_flow_control(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); if (!pn_flow_safe(pn->tx_fc)) { atomic_set(&pn->tx_credits, 1); sk->sk_write_space(sk); } pipe_grant_credits(sk, GFP_ATOMIC); } /* Queue an skb to an actively connected sock. * Socket lock must be held. */ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); int err = NET_RX_SUCCESS; switch (hdr->message_id) { case PNS_PIPE_ALIGNED_DATA: __skb_pull(skb, 1); fallthrough; case PNS_PIPE_DATA: __skb_pull(skb, 3); /* Pipe data header */ if (!pn_flow_safe(pn->rx_fc)) { err = sock_queue_rcv_skb(sk, skb); if (!err) return NET_RX_SUCCESS; err = NET_RX_DROP; break; } if (pn->rx_credits == 0) { sk_drops_inc(sk); err = NET_RX_DROP; break; } pn->rx_credits--; skb->dev = NULL; skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_CONNECT_RESP: if (sk->sk_state != TCP_SYN_SENT) break; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); if (pep_connresp_rcv(sk, skb)) { sk->sk_state = TCP_CLOSE_WAIT; break; } if (pn->init_enable == PN_PIPE_DISABLE) sk->sk_state = TCP_SYN_RECV; else { sk->sk_state = TCP_ESTABLISHED; pipe_start_flow_control(sk); } break; case PNS_PEP_ENABLE_RESP: if (sk->sk_state != TCP_SYN_SENT) break; if (pep_enableresp_rcv(sk, skb)) { sk->sk_state = TCP_CLOSE_WAIT; break; } sk->sk_state = TCP_ESTABLISHED; pipe_start_flow_control(sk); break; case PNS_PEP_DISCONNECT_RESP: /* sock should already be dead, nothing to do */ break; case PNS_PEP_STATUS_IND: pipe_rcv_status(sk, skb); break; } kfree_skb(skb); return err; } /* Listening sock must be locked */ static struct sock *pep_find_pipe(const struct hlist_head *hlist, const struct sockaddr_pn *dst, u8 pipe_handle) { struct sock *sknode; u16 dobj = pn_sockaddr_get_object(dst); sk_for_each(sknode, hlist) { struct pep_sock *pnnode = pep_sk(sknode); /* Ports match, but addresses might not: */ if (pnnode->pn_sk.sobject != dobj) continue; if (pnnode->pipe_handle != pipe_handle) continue; if (sknode->sk_state == TCP_CLOSE_WAIT) continue; sock_hold(sknode); return sknode; } return NULL; } /* * Deliver an skb to a listening sock. * Socket lock must be held. * We then queue the skb to the right connected sock (if any). */ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct sock *sknode; struct pnpipehdr *hdr; struct sockaddr_pn dst; u8 pipe_handle; if (!pskb_may_pull(skb, sizeof(*hdr))) goto drop; hdr = pnp_hdr(skb); pipe_handle = hdr->pipe_handle; if (pipe_handle == PN_PIPE_INVALID_HANDLE) goto drop; pn_skb_get_dst_sockaddr(skb, &dst); /* Look for an existing pipe handle */ sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle); if (sknode) return sk_receive_skb(sknode, skb, 1); switch (hdr->message_id) { case PNS_PEP_CONNECT_REQ: if (sk->sk_state != TCP_LISTEN || sk_acceptq_is_full(sk)) { pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC); break; } skb_queue_head(&sk->sk_receive_queue, skb); sk_acceptq_added(sk); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_DISCONNECT_REQ: pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; case PNS_PEP_CTRL_REQ: pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE, GFP_ATOMIC); break; case PNS_PEP_RESET_REQ: case PNS_PEP_ENABLE_REQ: case PNS_PEP_DISABLE_REQ: /* invalid handle is not even allowed here! */ break; default: if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN|TCPF_CLOSE_WAIT)) /* actively connected socket */ return pipe_handler_do_rcv(sk, skb); } drop: kfree_skb(skb); return NET_RX_SUCCESS; } static int pipe_do_remove(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; skb = pep_alloc_skb(sk, NULL, 0, GFP_KERNEL); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = 0; ph->message_id = PNS_PIPE_REMOVE_REQ; ph->pipe_handle = pn->pipe_handle; ph->data0 = PAD; return pn_skb_send(sk, skb, NULL); } /* associated socket ceases to exist */ static void pep_sock_close(struct sock *sk, long timeout) { struct pep_sock *pn = pep_sk(sk); int ifindex = 0; sock_hold(sk); /* keep a reference after sk_common_release() */ sk_common_release(sk); lock_sock(sk); if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) { if (sk->sk_backlog_rcv == pipe_do_rcv) /* Forcefully remove dangling Phonet pipe */ pipe_do_remove(sk); else pipe_handler_request(sk, PNS_PEP_DISCONNECT_REQ, PAD, NULL, 0); } sk->sk_state = TCP_CLOSE; ifindex = pn->ifindex; pn->ifindex = 0; release_sock(sk); if (ifindex) gprs_detach(sk); sock_put(sk); } static struct sock *pep_sock_accept(struct sock *sk, struct proto_accept_arg *arg) { struct pep_sock *pn = pep_sk(sk), *newpn; struct sock *newsk = NULL; struct sk_buff *skb; struct pnpipehdr *hdr; struct sockaddr_pn dst, src; int err; u16 peer_type; u8 pipe_handle, enabled, n_sb; u8 aligned = 0; skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, &arg->err); if (!skb) return NULL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto drop; } sk_acceptq_removed(sk); err = -EPROTO; if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) goto drop; hdr = pnp_hdr(skb); pipe_handle = hdr->pipe_handle; switch (hdr->state_after_connect) { case PN_PIPE_DISABLE: enabled = 0; break; case PN_PIPE_ENABLE: enabled = 1; break; default: pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM, GFP_KERNEL); goto drop; } peer_type = hdr->other_pep_type << 8; /* Parse sub-blocks (options) */ n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[1], len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); if (data == NULL) goto drop; switch (type) { case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE: if (len < 1) goto drop; peer_type = (peer_type & 0xff00) | data[0]; break; case PN_PIPE_SB_ALIGNED_DATA: aligned = data[0] != 0; break; } n_sb--; } /* Check for duplicate pipe handle */ pn_skb_get_dst_sockaddr(skb, &dst); newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle); if (unlikely(newsk)) { __sock_put(newsk); newsk = NULL; pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_KERNEL); goto drop; } /* Create a new to-be-accepted sock */ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, arg->kern); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; goto drop; } sock_init_data(NULL, newsk); newsk->sk_state = TCP_SYN_RECV; newsk->sk_backlog_rcv = pipe_do_rcv; newsk->sk_protocol = sk->sk_protocol; newsk->sk_destruct = pipe_destruct; newpn = pep_sk(newsk); pn_skb_get_src_sockaddr(skb, &src); newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst); newpn->pn_sk.dobject = pn_sockaddr_get_object(&src); newpn->pn_sk.resource = pn_sockaddr_get_resource(&dst); sock_hold(sk); newpn->listener = sk; skb_queue_head_init(&newpn->ctrlreq_queue); newpn->pipe_handle = pipe_handle; atomic_set(&newpn->tx_credits, 0); newpn->ifindex = 0; newpn->peer_type = peer_type; newpn->rx_credits = 0; newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL; newpn->init_enable = enabled; newpn->aligned = aligned; err = pep_accept_conn(newsk, skb); if (err) { __sock_put(sk); sock_put(newsk); newsk = NULL; goto drop; } sk_add_node(newsk, &pn->hlist); drop: release_sock(sk); kfree_skb(skb); arg->err = err; return newsk; } static int pep_sock_connect(struct sock *sk, struct sockaddr_unsized *addr, int len) { struct pep_sock *pn = pep_sk(sk); int err; u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD }; if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE) pn->pipe_handle = 1; /* anything but INVALID_HANDLE */ err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ, pn->init_enable, data, 4); if (err) { pn->pipe_handle = PN_PIPE_INVALID_HANDLE; return err; } sk->sk_state = TCP_SYN_SENT; return 0; } static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len) { int err; err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD, NULL, 0); if (err) return err; sk->sk_state = TCP_SYN_SENT; return 0; } static unsigned int pep_first_packet_length(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); struct sk_buff_head *q; struct sk_buff *skb; unsigned int len = 0; bool found = false; if (sock_flag(sk, SOCK_URGINLINE)) { q = &pn->ctrlreq_queue; spin_lock_bh(&q->lock); skb = skb_peek(q); if (skb) { len = skb->len; found = true; } spin_unlock_bh(&q->lock); } if (likely(!found)) { q = &sk->sk_receive_queue; spin_lock_bh(&q->lock); skb = skb_peek(q); if (skb) len = skb->len; spin_unlock_bh(&q->lock); } return len; } static int pep_ioctl(struct sock *sk, int cmd, int *karg) { struct pep_sock *pn = pep_sk(sk); int ret = -ENOIOCTLCMD; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } *karg = pep_first_packet_length(sk); ret = 0; break; case SIOCPNENABLEPIPE: lock_sock(sk); if (sk->sk_state == TCP_SYN_SENT) ret = -EBUSY; else if (sk->sk_state == TCP_ESTABLISHED) ret = -EISCONN; else if (!pn->pn_sk.sobject) ret = -EADDRNOTAVAIL; else ret = pep_sock_enable(sk, NULL, 0); release_sock(sk); break; } return ret; } static int pep_init(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); sk->sk_destruct = pipe_destruct; INIT_HLIST_HEAD(&pn->hlist); pn->listener = NULL; skb_queue_head_init(&pn->ctrlreq_queue); atomic_set(&pn->tx_credits, 0); pn->ifindex = 0; pn->peer_type = 0; pn->pipe_handle = PN_PIPE_INVALID_HANDLE; pn->rx_credits = 0; pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL; pn->init_enable = 1; pn->aligned = 0; return 0; } static int pep_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct pep_sock *pn = pep_sk(sk); int val = 0, err = 0; if (level != SOL_PNPIPE) return -ENOPROTOOPT; if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; } lock_sock(sk); switch (optname) { case PNPIPE_ENCAP: if (val && val != PNPIPE_ENCAP_IP) { err = -EINVAL; break; } if (!pn->ifindex == !val) break; /* Nothing to do! */ if (!capable(CAP_NET_ADMIN)) { err = -EPERM; break; } if (val) { release_sock(sk); err = gprs_attach(sk); if (err > 0) { pn->ifindex = err; err = 0; } } else { pn->ifindex = 0; release_sock(sk); gprs_detach(sk); err = 0; } goto out_norel; case PNPIPE_HANDLE: if ((sk->sk_state == TCP_CLOSE) && (val >= 0) && (val < PN_PIPE_INVALID_HANDLE)) pn->pipe_handle = val; else err = -EINVAL; break; case PNPIPE_INITSTATE: pn->init_enable = !!val; break; default: err = -ENOPROTOOPT; } release_sock(sk); out_norel: return err; } static int pep_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct pep_sock *pn = pep_sk(sk); int len, val; if (level != SOL_PNPIPE) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; switch (optname) { case PNPIPE_ENCAP: val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE; break; case PNPIPE_IFINDEX: val = pn->ifindex; break; case PNPIPE_HANDLE: val = pn->pipe_handle; if (val == PN_PIPE_INVALID_HANDLE) return -EINVAL; break; case PNPIPE_INITSTATE: val = pn->init_enable; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (put_user(len, optlen)) return -EFAULT; if (put_user(val, (int __user *) optval)) return -EFAULT; return 0; } static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; int err; if (pn_flow_safe(pn->tx_fc) && !atomic_add_unless(&pn->tx_credits, -1, 0)) { kfree_skb(skb); return -ENOBUFS; } skb_push(skb, 3 + pn->aligned); skb_reset_transport_header(skb); ph = pnp_hdr(skb); ph->utid = 0; if (pn->aligned) { ph->message_id = PNS_PIPE_ALIGNED_DATA; ph->data0 = 0; /* padding */ } else ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; err = pn_skb_send(sk, skb, NULL); if (err && pn_flow_safe(pn->tx_fc)) atomic_inc(&pn->tx_credits); return err; } static int pep_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct pep_sock *pn = pep_sk(sk); struct sk_buff *skb; long timeo; int flags = msg->msg_flags; int err, done; if (len > USHRT_MAX) return -EMSGSIZE; if ((msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) || !(msg->msg_flags & MSG_EOR)) return -EOPNOTSUPP; skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len, flags & MSG_DONTWAIT, &err); if (!skb) return err; skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned); err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err < 0) goto outfree; lock_sock(sk); timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & (TCPF_LISTEN|TCPF_CLOSE)) { err = -ENOTCONN; goto out; } if (sk->sk_state != TCP_ESTABLISHED) { /* Wait until the pipe gets to enabled state */ disabled: err = sk_stream_wait_connect(sk, &timeo); if (err) goto out; if (sk->sk_state == TCP_CLOSE_WAIT) { err = -ECONNRESET; goto out; } } BUG_ON(sk->sk_state != TCP_ESTABLISHED); /* Wait until flow control allows TX */ done = atomic_read(&pn->tx_credits); while (!done) { DEFINE_WAIT_FUNC(wait, woken_wake_function); if (!timeo) { err = -EAGAIN; goto out; } if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } add_wait_queue(sk_sleep(sk), &wait); done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits), &wait); remove_wait_queue(sk_sleep(sk), &wait); if (sk->sk_state != TCP_ESTABLISHED) goto disabled; } err = pipe_skb_send(sk, skb); if (err >= 0) err = len; /* success! */ skb = NULL; out: release_sock(sk); outfree: kfree_skb(skb); return err; } int pep_writeable(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); return atomic_read(&pn->tx_credits); } int pep_write(struct sock *sk, struct sk_buff *skb) { struct sk_buff *rskb, *fs; int flen = 0; if (pep_sk(sk)->aligned) return pipe_skb_send(sk, skb); rskb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC); if (!rskb) { kfree_skb(skb); return -ENOMEM; } skb_shinfo(rskb)->frag_list = skb; rskb->len += skb->len; rskb->data_len += rskb->len; rskb->truesize += rskb->len; /* Avoid nested fragments */ skb_walk_frags(skb, fs) flen += fs->len; skb->next = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); skb->len -= flen; skb->data_len -= flen; skb->truesize -= flen; skb_reserve(rskb, MAX_PHONET_HEADER + 3); return pipe_skb_send(sk, rskb); } struct sk_buff *pep_read(struct sock *sk) { struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue); if (sk->sk_state == TCP_ESTABLISHED) pipe_grant_credits(sk, GFP_ATOMIC); return skb; } static int pep_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_buff *skb; int err; if (flags & ~(MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_WAITALL| MSG_NOSIGNAL|MSG_CMSG_COMPAT)) return -EOPNOTSUPP; if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE))) return -ENOTCONN; if ((flags & MSG_OOB) || sock_flag(sk, SOCK_URGINLINE)) { /* Dequeue and acknowledge control request */ struct pep_sock *pn = pep_sk(sk); if (flags & MSG_PEEK) return -EOPNOTSUPP; skb = skb_dequeue(&pn->ctrlreq_queue); if (skb) { pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR, GFP_KERNEL); msg->msg_flags |= MSG_OOB; goto copy; } if (flags & MSG_OOB) return -EINVAL; } skb = skb_recv_datagram(sk, flags, &err); lock_sock(sk); if (skb == NULL) { if (err == -ENOTCONN && sk->sk_state == TCP_CLOSE_WAIT) err = -ECONNRESET; release_sock(sk); return err; } if (sk->sk_state == TCP_ESTABLISHED) pipe_grant_credits(sk, GFP_KERNEL); release_sock(sk); copy: msg->msg_flags |= MSG_EOR; if (skb->len > len) msg->msg_flags |= MSG_TRUNC; else len = skb->len; err = skb_copy_datagram_msg(skb, 0, msg, len); if (!err) err = (flags & MSG_TRUNC) ? skb->len : len; skb_free_datagram(sk, skb); return err; } static void pep_sock_unhash(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); struct sock *skparent = NULL; lock_sock(sk); if (pn->listener != NULL) { skparent = pn->listener; pn->listener = NULL; release_sock(sk); pn = pep_sk(skparent); lock_sock(skparent); sk_del_node_init(sk); sk = skparent; } /* Unhash a listening sock only when it is closed * and all of its active connected pipes are closed. */ if (hlist_empty(&pn->hlist)) pn_sock_unhash(&pn->pn_sk.sk); release_sock(sk); if (skparent) sock_put(skparent); } static struct proto pep_proto = { .close = pep_sock_close, .accept = pep_sock_accept, .connect = pep_sock_connect, .ioctl = pep_ioctl, .init = pep_init, .setsockopt = pep_setsockopt, .getsockopt = pep_getsockopt, .sendmsg = pep_sendmsg, .recvmsg = pep_recvmsg, .backlog_rcv = pep_do_rcv, .hash = pn_sock_hash, .unhash = pep_sock_unhash, .get_port = pn_sock_get_port, .obj_size = sizeof(struct pep_sock), .owner = THIS_MODULE, .name = "PNPIPE", }; static const struct phonet_protocol pep_pn_proto = { .ops = &phonet_stream_ops, .prot = &pep_proto, .sock_type = SOCK_SEQPACKET, }; static int __init pep_register(void) { return phonet_proto_register(PN_PROTO_PIPE, &pep_pn_proto); } static void __exit pep_unregister(void) { phonet_proto_unregister(PN_PROTO_PIPE, &pep_pn_proto); } module_init(pep_register); module_exit(pep_unregister); MODULE_AUTHOR("Remi Denis-Courmont, Nokia"); MODULE_DESCRIPTION("Phonet pipe protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PHONET, PN_PROTO_PIPE);
847 228 2244 638 30 1397 582 36 16 28 59 1191 29 8187 2285 7810 1787 511 511 678 975 2244 1486 446 222 49 250 251 249 13 13 9 72 90 71 3 78 78 1592 304 1590 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> struct list_lru; /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err < 0 ? err : 0; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. */ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. */ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline int xas_get_order(struct xa_state *xas) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } static inline void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { } static inline unsigned int xas_try_split_min_order(unsigned int order) { return 0; } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
30 2 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FUTEX_H #define _FUTEX_H #include <linux/futex.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/cleanup.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> #endif #include <asm/futex.h> /* * Futex flags used to encode options to functions and preserve them across * restarts. */ #define FLAGS_SIZE_8 0x0000 #define FLAGS_SIZE_16 0x0001 #define FLAGS_SIZE_32 0x0002 #define FLAGS_SIZE_64 0x0003 #define FLAGS_SIZE_MASK 0x0003 #ifdef CONFIG_MMU # define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. */ # define FLAGS_SHARED 0x0000 #endif #define FLAGS_CLOCKRT 0x0020 #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 #define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) { unsigned int flags = FLAGS_SIZE_32; if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; if (op & FUTEX_CLOCK_REALTIME) flags |= FLAGS_CLOCKRT; return flags; } #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) { unsigned int flags = flags2 & FUTEX2_SIZE_MASK; if (!(flags2 & FUTEX2_PRIVATE)) flags |= FLAGS_SHARED; if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; if (flags2 & FUTEX2_MPOL) flags |= FLAGS_MPOL; return flags; } static inline unsigned int futex_size(unsigned int flags) { return 1 << (flags & FLAGS_SIZE_MASK); } static inline bool futex_flags_valid(unsigned int flags) { /* Only 64bit futexes for 64bit code */ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) return false; } /* Only 32bit futexes are implemented -- for now */ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; /* * Must be able to represent both FUTEX_NO_NODE and every valid nodeid * in a futex word. */ if (flags & FLAGS_NUMA) { int bits = 8 * futex_size(flags); u64 max = ~0ULL; max >>= 64 - bits; if (nr_node_ids >= max) return false; } return true; } static inline bool futex_validate_input(unsigned int flags, u64 val) { int bits = 8 * futex_size(flags); if (bits < 64 && (val >> bits)) return false; return true; } #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else static inline bool should_fail_futex(bool fshared) { return false; } #endif /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task * waiting on a futex. */ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* * Priority Inheritance state: */ struct futex_pi_state { /* * list of 'owned' pi_state instances - these have to be * cleaned up in do_exit() if the task exits prematurely: */ struct list_head list; /* * The PI object: */ struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; union futex_key key; } __randomize_layout; struct futex_q; typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); /** * struct futex_q - The hashed futex queue entry, one per waiting task * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @wake: the wake handler for this queue * @wake_data: data associated with the wake handler * @key: the key the futex is hashed on * @pi_state: optional priority inheritance state * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via * the rt_mutex code. See futex_unqueue_pi(). */ struct futex_q { struct plist_node list; struct task_struct *task; spinlock_t *lock_ptr; futex_wake_fn *wake; void *wake_data; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; bool drop_hb_ref; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif } __randomize_layout; extern const struct futex_q futex_q_init; enum futex_access { FUTEX_READ, FUTEX_WRITE }; extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern void futex_q_lockptr_lock(struct futex_q *q); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); #ifdef CONFIG_FUTEX_PRIVATE_HASH extern void futex_hash_get(struct futex_hash_bucket *hb); extern void futex_hash_put(struct futex_hash_bucket *hb); extern struct futex_private_hash *futex_private_hash(void); extern void futex_private_hash_put(struct futex_private_hash *fph); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static inline void futex_hash_get(struct futex_hash_bucket *hb) { } static inline void futex_hash_put(struct futex_hash_bucket *hb) { } static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } static inline void futex_private_hash_put(struct futex_private_hash *fph) { } #endif DEFINE_CLASS(hb, struct futex_hash_bucket *, if (_T) futex_hash_put(_T), futex_hash(key), union futex_key *key); DEFINE_CLASS(private_hash, struct futex_private_hash *, if (_T) futex_private_hash_put(_T), futex_private_hash(), void); /** * futex_match - Check whether two futex keys are equal * @key1: Pointer to key1 * @key2: Pointer to key2 * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int futex_match(union futex_key *key1, union futex_key *key2) { return (key1 && key2 && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task); extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern int fault_in_user_writeable(u32 __user *uaddr); extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } /* Read from user memory with pagefaults disabled */ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { guard(pagefault)(); return get_user_inline(*dest, from); } extern void __futex_unqueue(struct futex_q *q); extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The * exceptions involve the PI related operations, which may use futex_unqueue_pi() * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). * * Note that @task may be NULL, for async usage of futexes. */ static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) __releases(&hb->lock) { __futex_queue(q, hb, task); spin_unlock(&hb->lock); } extern void futex_unqueue_pi(struct futex_q *q); extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); /* * Reflects a new waiter being added to the waitqueue. */ static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_inc(&hb->waiters); /* * Full barrier (A), see the ordering comment above. */ smp_mb__after_atomic(); #endif } /* * Reflects a waiter being removed from the waitqueue by wakeup * paths. */ static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_dec(&hb->waiters); #endif } static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP /* * Full barrier (B), see the ordering comment above. */ smp_mb(); return atomic_read(&hb->waiters); #else return 1; #endif } extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); extern void futex_q_unlock(struct futex_hash_bucket *hb); extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, struct task_struct *task, struct task_struct **exiting, int set_waiters); extern int refill_pi_state_cache(void); extern void get_pi_state(struct futex_pi_state *pi_state); extern void put_pi_state(struct futex_pi_state *pi_state); extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); /* * Express the locking dependencies for lockdep: */ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { if (hb1 > hb2) swap(hb1, hb2); spin_lock(&hb1->lock); if (hb1 != hb2) spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); } static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); } /* syscalls */ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset); extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); /** * struct futex_vector - Auxiliary struct for futex_waitv() * @w: Userspace provided data * @q: Kernel side data * * Struct used to build an array with all data need for futex_waitv() */ struct futex_vector { struct futex_waitv w; struct futex_q q; }; extern int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data); extern int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken); extern int futex_unqueue_multiple(struct futex_vector *v, int count); extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to); extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); #endif /* _FUTEX_H */
4 4 4 4 1 4 2 2 17 17 16 1 17 8 1 7 3 4 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ #define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ #define HTYPE hash_ipportip /* IPv4 variant */ /* Member elements */ struct hash_ipportip4_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; }; static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { next->ip = d->ip; next->port = d->port; } /* Common functions */ #define MTYPE hash_ipportip4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_ipportip4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipportip6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportip6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportip_init(void) { return ip_set_type_register(&hash_ipportip_type); } static void __exit hash_ipportip_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } module_init(hash_ipportip_init); module_exit(hash_ipportip_fini);
13541 75 790 737 70 13 14456 49 522 13541 790 13 14466 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* I/O iterator iteration building functions. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_IOV_ITER_H #define _LINUX_IOV_ITER_H #include <linux/uio.h> #include <linux/bvec.h> #include <linux/folio_queue.h> typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len, void *priv, void *priv2); /* * Handle ITER_UBUF. */ static __always_inline size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { void __user *base = iter->ubuf; size_t progress = 0, remain; remain = step(base + iter->iov_offset, 0, len, priv, priv2); progress = len - remain; iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_IOVEC. */ static __always_inline size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { const struct iovec *p = iter->__iov; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->__iov; iter->__iov = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_KVEC. */ static __always_inline size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct kvec *p = iter->kvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->kvec; iter->kvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_BVEC. */ static __always_inline size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct bio_vec *p = iter->bvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t offset = p->bv_offset + skip, part; void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE); part = min3(len, (size_t)(p->bv_len - skip), (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2); kunmap_local(kaddr); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= p->bv_len) { skip = 0; p++; } if (remain) break; } while (len); iter->nr_segs -= p - iter->bvec; iter->bvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_FOLIOQ. */ static __always_inline size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct folio_queue *folioq = iter->folioq; unsigned int slot = iter->folioq_slot; size_t progress = 0, skip = iter->iov_offset; if (slot == folioq_nr_slots(folioq)) { /* The iterator may have been extended. */ folioq = folioq->next; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t part, remain = 0, consumed; size_t fsize; void *base; if (!folio) break; fsize = folioq_folio_size(folioq, slot); if (skip < fsize) { base = kmap_local_folio(folio, skip); part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; } if (skip >= fsize) { skip = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } if (remain) break; } while (len); iter->folioq_slot = slot; iter->folioq = folioq; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_XARRAY. */ static __always_inline size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { struct folio *folio; size_t progress = 0; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; XA_STATE(xas, iter->xarray, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { size_t remain, consumed, offset, part, flen; if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start + progress); flen = min(folio_size(folio) - offset, len); while (flen) { void *base = kmap_local_folio(folio, offset); part = min_t(size_t, flen, PAGE_SIZE - offset_in_page(offset)); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; progress += consumed; len -= consumed; if (remain || len == 0) goto out; flen -= consumed; offset += consumed; } } out: rcu_read_unlock(); iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_DISCARD. */ static __always_inline size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { size_t progress = len; iter->count -= progress; return progress; } /** * iterate_and_advance2 - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * Two step functions, @step and @ustep, must be provided, one for handling * mapped kernel addresses and the other is given user addresses which have the * potential to fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f ustep, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (likely(iter_is_ubuf(iter))) return iterate_ubuf(iter, len, priv, priv2, ustep); if (likely(iter_is_iovec(iter))) return iterate_iovec(iter, len, priv, priv2, ustep); if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } /** * iterate_and_advance - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * As iterate_and_advance2(), but priv2 is always NULL. */ static __always_inline size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, iov_ustep_f ustep, iov_step_f step) { return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } /** * iterate_and_advance_kernel - Iterate over a kernel-internal iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type * iterators; it will not handle UBUF or IOVEC-type iterators. * * A step functions, @step, must be provided, one for handling mapped kernel * addresses and the other is given user addresses which have the potential to * fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } #endif /* _LINUX_IOV_ITER_H */
50 1338 277 277 4 4 14 14 5 5 498 500 234 236 235 236 15 15 15 15 15 15 12132 1094 12039 20 21 21 20 21 21 21 20 21 4161 4154 4148 4152 4156 4167 4167 4156 1726 4077 4155 4161 4150 1623 1622 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthreads_hotplug); static DEFINE_MUTEX(kthreads_hotplug_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head hotplug_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will * always remain a kthread. For kthreads p->worker_private always * points to a struct kthread. For tasks that are not kthreads * p->worker_private is used to point to other things. * * Return NULL for any task that is not a kthread. */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } scoped_guard (raw_spinlock_irqsave, &p->pi_lock) set_cpus_allowed_force(p, mask); /* It's safe because the task is inactive. */ p->flags |= PF_NO_SETAFFINITY; } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); scoped_guard (raw_spinlock_irqsave, &p->pi_lock) set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = timer_container_of(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); WARN_ON_ONCE(!mm->user_ns); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 // SPDX-License-Identifier: GPL-2.0 /* * Shared application/kernel submission and completion ring pairs, for * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between * the application and kernel side. * * After the application reads the CQ ring tail, it must use an * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses * before writing the tail (using smp_load_acquire to read the tail will * do). It also needs a smp_mb() before updating CQ head (ordering the * entry load(s) with the head store), pairing with an implicit barrier * through a control-dependency in io_get_cqe (smp_store_release to * store head will do). Failure to do so could lead to reading invalid * CQ entries. * * Likewise, the application must use an appropriate smp_wmb() before * writing the SQ tail (ordering SQ entry stores with the tail store), * which pairs with smp_load_acquire in io_get_sqring (smp_store_release * to store the tail will do). And it needs a barrier ordering the SQ * head load before writing new SQ entries (smp_load_acquire to read * head will do). * * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* * updating the SQ tail; a full memory barrier smp_mb() is needed * between. * * Also see the examples in the liburing library: * * git://git.kernel.org/pub/scm/linux/kernel/git/axboe/liburing.git * * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens * from data shared between the kernel and application. This is done both * for ordering purposes, but also to ensure that once a value is loaded from * data that the application could potentially modify, it remains stable. * * Copyright (C) 2018-2019 Jens Axboe * Copyright (c) 2018-2019 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <net/compat.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/bits.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/task_work.h> #include <linux/io_uring.h> #include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> #include <linux/jump_label.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io-wq.h" #include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" #include "tctx.h" #include "register.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" #include "cancel.h" #include "net.h" #include "notif.h" #include "waitid.h" #include "futex.h" #include "napi.h" #include "uring_cmd.h" #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" #include "eventfd.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ REQ_F_REISSUE | REQ_F_POLLED | \ IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 #define IO_LOCAL_TW_DEFAULT_MAX 20 /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) /* * No waiters. It's larger than any valid value of the tw counter * so that tests against ->cq_wait_nr would fail and skip wake_up(). */ #define IO_CQ_WAKE_INIT (-1U) /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); struct kmem_cache *req_cachep; static struct workqueue_struct *iou_wq __ro_after_init; static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL static const struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, .maxlen = sizeof(sysctl_io_uring_disabled), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "io_uring_group", .data = &sysctl_io_uring_group, .maxlen = sizeof(gid_t), .mode = 0644, .proc_handler = proc_dointvec, }, }; #endif static void io_poison_cached_req(struct io_kiocb *req) { req->ctx = IO_URING_PTR_POISON; req->tctx = IO_URING_PTR_POISON; req->file = IO_URING_PTR_POISON; req->creds = IO_URING_PTR_POISON; req->io_task_work.func = IO_URING_PTR_POISON; req->apoll = IO_URING_PTR_POISON; } static void io_poison_req(struct io_kiocb *req) { io_poison_cached_req(req); req->async_data = IO_URING_PTR_POISON; req->kbuf = IO_URING_PTR_POISON; req->comp_list.next = IO_URING_PTR_POISON; req->file_node = IO_URING_PTR_POISON; req->link = IO_URING_PTR_POISON; } static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) { return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); } static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); io_req_set_res(req, res, 0); } static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { if (IS_ENABLED(CONFIG_KASAN)) io_poison_cached_req(req); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); complete(&ctx->ref_comp); } /* * Terminate the request if either of these conditions are true: * * 1) It's being executed by the original task, but that task is marked * with PF_EXITING as it's exiting. * 2) PF_KTHREAD is set, in which case the invoker of the task_work is * our fallback task_work. * 3) The ring has been closed and is going away. */ static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) { return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); } static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; struct io_tw_state ts = {}; percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); ts.cancel = io_should_terminate_tw(ctx); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func((struct io_tw_req){req}, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { unsigned int hash_buckets; int i; do { hash_buckets = 1U << bits; table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]), GFP_KERNEL_ACCOUNT); if (table->hbs) break; if (bits == 1) return -ENOMEM; bits--; } while (1); table->hash_bits = bits; for (i = 0; i < hash_buckets; i++) INIT_HLIST_HEAD(&table->hbs[i].list); return 0; } static void io_free_alloc_caches(struct io_ring_ctx *ctx) { io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); } static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; bool ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; xa_init(&ctx->io_bl_xa); /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread, but * don't keep too many buckets to not overconsume memory. */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; ctx->hybrid_poll_time = LLONG_MAX; atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr), offsetof(struct io_async_msghdr, clear)); ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw), offsetof(struct io_async_rw, clear)); ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_cmd), sizeof(struct io_async_cmd)); ret |= io_futex_cache_init(ctx); ret |= io_rsrc_cache_init(ctx); if (ret) goto free_ref; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); mutex_init(&ctx->tctx_lock); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); mutex_init(&ctx->mmap_lock); return ctx; free_ref: percpu_ref_exit(&ctx->refs); err: io_free_alloc_caches(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) io_kbuf_drop_legacy(req); if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; if (def->cleanup) def->cleanup(req); } if (req->flags & REQ_F_INFLIGHT) atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { kfree(req->async_data); req->async_data = NULL; } req->flags &= ~IO_REQ_CLEAN_FLAGS; } /* * Mark the request as inflight, so that file cancelation will find it. * Can be used if the file is an io_uring instance, or if the request itself * relies on ->mm being alive for the duration of the request. */ inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; atomic_inc(&req->tctx->inflight_tracked); } } static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) return NULL; req->flags &= ~REQ_F_ARM_LTIMEOUT; req->flags |= REQ_F_LINK_TIMEOUT; /* linked timeouts should have two refs once prep'ed */ io_req_set_refcount(req); __io_req_set_refcount(req->link, 2); return req->link; } static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; req->creds = get_current_cred(); } req->work.list.next = NULL; atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; /* don't serialize this request if the fs doesn't need it */ if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } static void io_prep_async_link(struct io_kiocb *req) { struct io_kiocb *cur; if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); } } static void io_queue_iowq(struct io_kiocb *req) { struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { io_req_task_queue_fail(req, -ECANCELED); return; } /* init ->work of the whole link before punting */ io_prep_async_link(req); /* * Not expected to happen, but if we do have a bug where this _can_ * happen, catch it here and ensure the request is marked as * canceled. That will make io-wq go through the usual work cancel * procedure rather than attempt to run this request (or create a new * worker for it). */ if (WARN_ON_ONCE(!same_thread_group(tctx->task, current))) atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); } static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) { io_queue_iowq(tw_req.req); } void io_req_queue_iowq(struct io_kiocb *req) { req->io_task_work.func = io_req_queue_iowq_tw; io_req_task_work_add(req); } unsigned io_linked_nr(struct io_kiocb *req) { struct io_kiocb *tmp; unsigned nr = 0; io_for_each_link(tmp, req) nr++; return nr; } static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { bool drain_seen = false, first = true; lockdep_assert_held(&ctx->uring_lock); __io_req_caches_free(ctx); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); drain_seen |= de->req->flags & REQ_F_IO_DRAIN; if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) return; list_del_init(&de->list); ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); first = false; } } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if (ctx->has_evfd) io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { if (!ctx->lockless_cq) spin_lock(&ctx->completion_lock); } static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { spin_lock(&ctx->completion_lock); } static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); if (!ctx->task_complete) { if (!ctx->lockless_cq) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ if (!ctx->syscall_iopoll) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); } static void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_wake(ctx); io_commit_cqring_flush(ctx); } static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { lockdep_assert_held(&ctx->uring_lock); /* don't abort if we're dying, entries must get freed */ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { size_t cqe_size = sizeof(struct io_uring_cqe); struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; bool is_cqe32 = false; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); if (ocqe->cqe.flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { is_cqe32 = true; cqe_size <<= 1; } if (ctx->flags & IORING_SETUP_CQE32) is_cqe32 = false; if (!dying) { if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) break; memcpy(cqe, &ocqe->cqe, cqe_size); } list_del(&ocqe->list); kfree(ocqe); /* * For silly syzbot cases that deliberately overflow by huge * amounts, check if we need to resched and drop and * reacquire the locks if so. Nothing real would ever hit this. * Ideally we'd have a non-posting unlock for this, but hard * to care for a non-real case. */ if (need_resched()) { ctx->cqe_sentinel = ctx->cqe_cached; io_cq_unlock_post(ctx); mutex_unlock(&ctx->uring_lock); cond_resched(); mutex_lock(&ctx->uring_lock); io_cq_lock(ctx); } } if (list_empty(&ctx->cq_overflow_list)) { clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } io_cq_unlock_post(ctx); } static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { if (ctx->rings) __io_cqring_overflow_flush(ctx, true); } static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); __io_cqring_overflow_flush(ctx, false); mutex_unlock(&ctx->uring_lock); } /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct io_kiocb *req) { struct io_uring_task *tctx = req->tctx; if (likely(tctx->task == current)) { tctx->cached_refs++; } else { percpu_counter_sub(&tctx->inflight, 1); if (unlikely(atomic_read(&tctx->in_cancel))) wake_up(&tctx->wait); put_task_struct(tctx->task); } } void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; percpu_counter_add(&tctx->inflight, refill); refcount_add(refill, &current->usage); tctx->cached_refs += refill; } __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; if (refs) { tctx->cached_refs = 0; percpu_counter_sub(&tctx->inflight, refs); put_task_struct_many(task, refs); } } static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, struct io_overflow_cqe *ocqe) { lockdep_assert_held(&ctx->completion_lock); if (!ocqe) { struct io_rings *r = ctx->rings; /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } if (list_empty(&ctx->cq_overflow_list)) { set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, struct io_cqe *cqe, struct io_big_cqe *big_cqe, gfp_t gfp) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); bool is_cqe32 = false; if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { is_cqe32 = true; ocq_size += sizeof(struct io_uring_cqe); } ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); if (ocqe) { ocqe->cqe.user_data = cqe->user_data; ocqe->cqe.res = cqe->res; ocqe->cqe.flags = cqe->flags; if (is_cqe32 && big_cqe) { ocqe->cqe.big_cqe[0] = big_cqe->extra1; ocqe->cqe.big_cqe[1] = big_cqe->extra2; } } if (big_cqe) big_cqe->extra1 = big_cqe->extra2 = 0; return ocqe; } /* * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE * because the ring is a single 16b entry away from wrapping. */ static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) { if (__io_cqring_events(ctx) < ctx->cq_entries) { struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; cqe->user_data = 0; cqe->res = 0; cqe->flags = IORING_CQE_F_SKIP; ctx->cached_cq_tail++; return true; } return false; } /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); unsigned int free, queued, len; /* * Posting into the CQ when there are pending overflowed CQEs may break * ordering guarantees, which will affect links, F_MORE users and more. * Force overflow the completion. */ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; /* * Post dummy CQE if a 32b CQE is needed and there's only room for a * 16b CQE before the ring wraps. */ if (cqe32 && off + 1 == ctx->cq_entries) { if (!io_fill_nop_cqe(ctx, off)) return false; off = 0; } /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (len < (cqe32 + 1)) return false; if (ctx->flags & IORING_SETUP_CQE32) { off <<= 1; len <<= 1; } ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; return true; } static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, struct io_uring_cqe src_cqe[2]) { struct io_uring_cqe *cqe; if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) return false; if (unlikely(!io_get_cqe(ctx, &cqe, true))) return false; memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); trace_io_uring_complete(ctx, NULL, cqe); return true; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } trace_io_uring_complete(ctx, NULL, cqe); return true; } return false; } static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) { return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; } static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, struct io_big_cqe *big_cqe) { struct io_overflow_cqe *ocqe; ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); } static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, struct io_cqe *cqe, struct io_big_cqe *big_cqe) { struct io_overflow_cqe *ocqe; ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_NOWAIT); return io_cqring_add_overflow(ctx, ocqe); } bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (unlikely(!filled)) { struct io_cqe cqe = io_init_cqe(user_data, res, cflags); filled = io_cqe_overflow_locked(ctx, &cqe, NULL); } io_cq_unlock_post(ctx); return filled; } /* * Must be called from inline task_work so we know a flush will happen later, * and obviously with ctx->uring_lock held (tw always has that). */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { lockdep_assert_held(&ctx->uring_lock); lockdep_assert(ctx->lockless_cq); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_cqe cqe = io_init_cqe(user_data, res, cflags); io_cqe_overflow(ctx, &cqe, NULL); } ctx->submit_state.cq_flush = true; } /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; bool posted; /* * If multishot has already posted deferred completions, ensure that * those are flushed first before posting this one. If not, CQEs * could get reordered. */ if (!wq_list_empty(&ctx->submit_state.compl_reqs)) __io_submit_flush_completions(ctx); lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); if (!ctx->lockless_cq) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); spin_unlock(&ctx->completion_lock); } else { posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); } ctx->submit_state.cq_flush = true; return posted; } /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) { struct io_ring_ctx *ctx = req->ctx; bool posted; lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); cqe[0].user_data = req->cqe.user_data; if (!ctx->lockless_cq) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux32(ctx, cqe); spin_unlock(&ctx->completion_lock); } else { posted = io_fill_cqe_aux32(ctx, cqe); } ctx->submit_state.cq_flush = true; return posted; } static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; bool completed = true; /* * All execution paths but io-wq use the deferred completions by * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here. */ if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ))) return; /* * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; } io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) completed = io_fill_cqe_req(ctx, req); io_cq_unlock_post(ctx); if (!completed) goto defer_complete; /* * We don't free the request here because we know it's called from * io-wq only, which holds a reference, so it cannot be the last put. */ req_ref_put(req); } void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); if (def->fail) def->fail(req); io_req_complete_defer(req); } /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, * retry single alloc to be on the safe side. */ if (unlikely(ret <= 0)) { reqs[0] = kmem_cache_alloc(req_cachep, gfp); if (!reqs[0]) return false; ret = 1; } percpu_ref_get_many(&ctx->refs, ret); ctx->nr_req_allocated += ret; while (ret--) { struct io_kiocb *req = reqs[ret]; io_req_add_to_cache(req, ctx); } return true; } __cold void io_free_req(struct io_kiocb *req) { /* refs were already put, restore them for io_req_task_complete() */ req->flags &= ~REQ_F_REFCOUNT; /* we only want to free it, don't post CQEs */ req->flags |= REQ_F_CQE_SKIP; req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); io_disarm_next(req); spin_unlock(&ctx->completion_lock); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ if (unlikely(req->flags & IO_DISARM_MASK)) __io_req_find_next_prep(req); nxt = req->link; req->link = NULL; return nxt; } static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } /* * Run queued task_work, returning the number of entries processed in *count. * If more entries than max_entries are available, stop processing once this * is reached and return the rest of the list. */ struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries) { struct io_ring_ctx *ctx = NULL; struct io_tw_state ts = { }; do { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); if (req->ctx != ctx) { ctx_flush_and_put(ctx, ts); ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); ts.cancel = io_should_terminate_tw(ctx); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, (struct io_tw_req){req}, ts); node = next; (*count)++; if (unlikely(need_resched())) { ctx_flush_and_put(ctx, ts); ctx = NULL; cond_resched(); } } while (node && *count < max_entries); ctx_flush_and_put(ctx, ts); return node; } static __cold void __io_fallback_tw(struct llist_node *node, bool sync) { struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; if (last_ctx != req->ctx) { if (last_ctx) { if (sync) flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) schedule_delayed_work(&last_ctx->fallback_work, 1); } if (last_ctx) { if (sync) flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } static void io_fallback_tw(struct io_uring_task *tctx, bool sync) { struct llist_node *node = llist_del_all(&tctx->task_list); __io_fallback_tw(node, sync); } struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count) { struct llist_node *node; node = llist_del_all(&tctx->task_list); if (node) { node = llist_reverse_order(node); node = io_handle_tw_list(node, count, max_entries); } /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) io_uring_drop_tctx_refs(current); trace_io_uring_task_work_run(tctx, *count); return node; } void tctx_task_work(struct callback_head *cb) { struct io_uring_task *tctx; struct llist_node *ret; unsigned int count = 0; tctx = container_of(cb, struct io_uring_task, task_work); ret = tctx_task_work_run(tctx, UINT_MAX, &count); /* can't happen */ WARN_ON_ONCE(ret); } static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; /* See comment above IO_CQ_WAKE_INIT */ BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); /* * We don't know how many requests there are in the link and whether * they can even be queued lazily, fall back to non-lazy. */ if (req->flags & IO_REQ_LINK_FLAGS) flags &= ~IOU_F_TWQ_LAZY_WAKE; guard(rcu)(); head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; if (head) { struct io_kiocb *first_req = container_of(head, struct io_kiocb, io_task_work.node); /* * Might be executed at any moment, rely on * SLAB_TYPESAFE_BY_RCU to keep it alive. */ nr_tw_prev = READ_ONCE(first_req->nr_tw); } /* * Theoretically, it can overflow, but that's fine as one of * previous adds should've tried to wake the task. */ nr_tw = nr_tw_prev + 1; if (!(flags & IOU_F_TWQ_LAZY_WAKE)) nr_tw = IO_CQ_WAKE_FORCE; req->nr_tw = nr_tw; req->io_task_work.node.next = head; } while (!try_cmpxchg(&ctx->work_llist.first, &head, &req->io_task_work.node)); /* * cmpxchg implies a full barrier, which pairs with the barrier * in set_current_state() on the io_cqring_wait() side. It's used * to ensure that either we see updated ->cq_wait_nr, or waiters * going to sleep will observe the work added to the list, which * is similar to the wait/wawke task state sync. */ if (!head) { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) io_eventfd_signal(ctx, false); } nr_wait = atomic_read(&ctx->cq_wait_nr); /* not enough or no one is waiting */ if (nr_tw < nr_wait) return; /* the previous add has already woken it up */ if (nr_tw_prev >= nr_wait) return; wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } static void io_req_normal_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->tctx; struct io_ring_ctx *ctx = req->ctx; /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); /* SQPOLL doesn't need the task_work added, it'll run it itself */ if (ctx->flags & IORING_SETUP_SQPOLL) { __set_notify_signal(tctx->task); return; } if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) return; io_fallback_tw(tctx, false); } void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_req_local_work_add(req, flags); else io_req_normal_work_add(req); } void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) { if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) return; __io_req_task_work_add(req, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node = llist_del_all(&ctx->work_llist); __io_fallback_tw(node, false); node = llist_del_all(&ctx->retry_llist); __io_fallback_tw(node, false); } static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, int min_events) { if (!io_local_work_pending(ctx)) return false; if (events < min_events) return true; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); return false; } static int __io_run_local_work_loop(struct llist_node **node, io_tw_token_t tw, int events) { int ret = 0; while (*node) { struct llist_node *next = (*node)->next; struct io_kiocb *req = container_of(*node, struct io_kiocb, io_task_work.node); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, (struct io_tw_req){req}, tw); *node = next; if (++ret >= events) break; } return ret; } static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, int min_events, int max_events) { struct llist_node *node; unsigned int loops = 0; int ret = 0; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: tw.cancel = io_should_terminate_tw(ctx); min_events -= ret; ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); if (ctx->retry_llist.first) goto retry_done; /* * llists are in reverse order, flip it back the right way before * running the pending items. */ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); ret += __io_run_local_work_loop(&node, tw, max_events - ret); ctx->retry_llist.first = node; loops++; if (io_run_local_work_continue(ctx, ret, min_events)) goto again; retry_done: io_submit_flush_completions(ctx); if (io_run_local_work_continue(ctx, ret, min_events)) goto again; trace_io_uring_local_work_run(ctx, ret, loops); return ret; } static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) { struct io_tw_state ts = {}; if (!io_local_work_pending(ctx)) return 0; return __io_run_local_work(ctx, ts, min_events, max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) { struct io_tw_state ts = {}; int ret; mutex_lock(&ctx->uring_lock); ret = __io_run_local_work(ctx, ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_kiocb *req = tw_req.req; io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); if (unlikely(tw.cancel)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); else io_queue_sqe(req, 0); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) { io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; io_req_task_work_add(req); } void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; io_req_task_work_add(req); } void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); if (nxt) io_req_task_queue(nxt); } static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) { if (req->file_node) { io_put_rsrc_node(req->ctx, req->file_node); req->file_node = NULL; } if (req->flags & REQ_F_BUF_NODE) io_put_rsrc_node(req->ctx, req->buf_node); } static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { if (req->flags & REQ_F_REISSUE) { node = req->comp_list.next; req->flags &= ~REQ_F_REISSUE; io_queue_iowq(req); continue; } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) continue; } if ((req->flags & REQ_F_POLLED) && req->apoll) { struct async_poll *apoll = req->apoll; if (apoll->double_poll) kfree(apoll->double_poll); io_cache_free(&ctx->apoll_cache, apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) io_queue_next(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } io_put_file(req); io_req_put_rsrc_nodes(req); io_put_task(req); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); } void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; struct io_wq_work_node *node; __io_cq_lock(ctx); __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); /* * Requests marked with REQUEUE should not post a CQE, they * will go through the io-wq retry machinery and post one * later. */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); else io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } if (unlikely(ctx->drain_active)) io_queue_deferred(ctx); ctx->submit_state.cq_flush = false; } static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ smp_rmb(); return __io_cqring_events(ctx); } /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); while (!wq_list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ if (io_do_iopoll(ctx, true) == 0) break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. * Also let task_work, etc. to progress by releasing the mutex */ if (need_resched()) { mutex_unlock(&ctx->uring_lock); cond_resched(); mutex_lock(&ctx->uring_lock); } } mutex_unlock(&ctx->uring_lock); if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); } static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { unsigned int nr_events = 0; unsigned long check_cq; min_events = min(min_events, ctx->cq_entries); lockdep_assert_held(&ctx->uring_lock); if (!io_allowed_run_tw(ctx)) return -EEXIST; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) __io_cqring_overflow_flush(ctx, false); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. */ if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) return -EBADR; } /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ if (io_cqring_events(ctx)) return 0; do { int ret = 0; /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets * issued. That app will hold the uring_lock for the duration * of the poll right here, so we need to take a breather every * now and then to ensure that the issue has a chance to add * the poll to the issued list. Otherwise we can spin here * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ if (wq_list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); } /* some requests don't go through iopoll_list */ if (tail != ctx->cached_cq_tail || wq_list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, !min_events); if (unlikely(ret < 0)) return ret; if (task_sigpending(current)) return -EINTR; if (need_resched()) break; nr_events += ret; } while (nr_events < min_events); return 0; } void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) { io_req_complete_defer(tw_req.req); } /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; /* workqueue context doesn't hold uring_lock, grab it now */ if (unlikely(needs_lock)) mutex_lock(&ctx->uring_lock); /* * Track whether we have multiple files in our lists. This will impact * how we do polling eventually, not spinning if we're on potentially * different devices. */ if (wq_list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, comp_list); if (list_req->file != req->file) ctx->poll_multi_queue = true; } /* * For fast devices, IO may have already completed. If it has, add * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) wq_list_add_head(&req->comp_list, &ctx->iopoll_list); else wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); if (unlikely(needs_lock)) { /* * If IORING_SETUP_SQPOLL is enabled, sqes are either handle * in sq thread task context or in io worker task context. If * current task context is sq thread, we don't need to check * whether should wake up sq thread. */ if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); mutex_unlock(&ctx->uring_lock); } } io_req_flags_t io_file_get_flags(struct file *file) { io_req_flags_t res = 0; BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) res |= REQ_F_SUPPORT_NOWAIT; return res; } static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { io_req_defer_failed(req, -ENOMEM); return; } io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; ctx->nr_drained += io_linked_nr(req); list_add_tail(&de->list, &ctx->defer_list); io_queue_deferred(ctx); if (!drain && list_empty(&ctx->defer_list)) ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, unsigned int issue_flags) { if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else req->file = io_file_get_normal(req, req->cqe.fd); return !!req->file; } #define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) static inline int __io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags, const struct io_issue_def *def) { const struct cred *creds = NULL; struct io_kiocb *link = NULL; int ret; if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) creds = override_creds(req->creds); if (req->flags & REQ_F_ARM_LTIMEOUT) link = __io_prep_linked_timeout(req); } if (!def->audit_skip) audit_uring_entry(req->opcode); ret = def->issue(req, issue_flags); if (!def->audit_skip) audit_uring_exit(!ret, ret); if (unlikely(creds || link)) { if (creds) revert_creds(creds); if (link) io_queue_linked_timeout(link); } return ret; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; int ret; if (unlikely(!io_assign_file(req, def, issue_flags))) return -EBADF; ret = __io_issue_sqe(req, issue_flags, def); if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else io_req_complete_post(req, issue_flags); return 0; } if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) io_iopoll_req_issued(req, issue_flags); } return ret; } int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { const unsigned int issue_flags = IO_URING_F_NONBLOCK | IO_URING_F_MULTISHOT | IO_URING_F_COMPLETE_DEFER; int ret; io_tw_lock(req->ctx, tw); WARN_ON_ONCE(!req->file); if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EFAULT; ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); return ret; } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; if (req_ref_put_and_test_atomic(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); } return nxt ? &nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else req_ref_get(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } /* * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the * submitter task context. Final request completions are handed to the * right context, however this is not the case of auxiliary CQEs, * which is the main mean of operation for multishot requests. * Don't allow any multishot execution from io-wq. It's more restrictive * than necessary and also cleaner. */ if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { err = -EBADFD; if (!io_file_can_poll(req)) goto fail; if (req->file->f_flags & O_NONBLOCK || req->file->f_mode & FMODE_NOWAIT) { err = -ECANCELED; if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) goto fail; return; } else { req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); } } if (req->flags & REQ_F_FORCE_ASYNC) { bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && io_file_can_poll(req)) { needs_poll = true; issue_flags |= IO_URING_F_NONBLOCK; } } do { ret = io_issue_sqe(req, issue_flags); if (ret != -EAGAIN) break; /* * If REQ_F_NOWAIT is set, then don't wait or retry with * poll. -EAGAIN is final for that case. */ if (req->flags & REQ_F_NOWAIT) break; /* * We can get EAGAIN for iopolled IO even though we're * forcing a sync submission from here, since we can't * wait for request slots on the block side. */ if (!needs_poll) { if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) break; if (io_wq_worker_stopped()) break; cond_resched(); continue; } if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) return; /* aborted or ready, in either case retry blocking */ needs_poll = false; issue_flags &= ~IO_URING_F_NONBLOCK; } while (1); /* avoid locking problems by failing it from a clean context */ if (ret) io_req_task_queue_fail(req, ret); } inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *node; struct file *file = NULL; io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { node->refs++; req->file_node = node; req->flags |= io_slot_flags(node); file = io_slot_file(node); } io_ring_submit_unlock(ctx, issue_flags); return file; } struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); trace_io_uring_file_get(req, fd); /* we don't allow fixed io_uring files */ if (file && io_is_uring_fops(file)) io_req_track_inflight(req); return file; } static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; if (req->flags & REQ_F_SQE_COPIED) return 0; req->flags |= REQ_F_SQE_COPIED; if (!def->sqe_copy) return 0; if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE))) return -EFAULT; def->sqe_copy(req); return 0; } static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret) __must_hold(&req->ctx->uring_lock) { if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { fail: io_req_defer_failed(req, ret); return; } ret = io_req_sqe_copy(req, issue_flags); if (unlikely(ret)) goto fail; switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_req_task_queue(req); break; case IO_APOLL_ABORTED: io_queue_iowq(req); break; case IO_APOLL_OK: break; } } static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags) __must_hold(&req->ctx->uring_lock) { unsigned int issue_flags = IO_URING_F_NONBLOCK | IO_URING_F_COMPLETE_DEFER | extra_flags; int ret; ret = io_issue_sqe(req, issue_flags); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (unlikely(ret)) io_queue_async(req, issue_flags, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { if (unlikely(req->flags & REQ_F_FAIL)) { /* * We don't submit, fail them all, for that replace hardlinks * with normal links. Extra REQ_F_LINK is tolerated. */ req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { /* can't fail with IO_URING_F_INLINE */ io_req_sqe_copy(req, IO_URING_F_INLINE); if (unlikely(req->ctx->drain_active)) io_drain_req(req); else io_queue_iowq(req); } } /* * Check SQE restrictions (opcode and flags). * * Returns 'true' if SQE is allowed, 'false' otherwise. */ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; if ((sqe_flags & ctx->restrictions.sqe_flags_required) != ctx->restrictions.sqe_flags_required) return false; if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | ctx->restrictions.sqe_flags_required)) return false; return true; } static void io_init_drain(struct io_ring_ctx *ctx) { struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; if (head) { /* * If we need to drain a request in the middle of a link, drain * the head request and the next request/link after the current * link. Considering sequential execution of links, * REQ_F_IO_DRAIN will be maintained for every request of our * link. */ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; ctx->drain_next = true; } } static __cold int io_init_fail_req(struct io_kiocb *req, int err) { /* ensure per-opcode data is cleared if we fail before prep */ memset(&req->cmd.data, 0, sizeof(req->cmd.data)); return err; } static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); req->flags = (__force io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->tctx = current->io_uring; req->cancel_seq_set = false; req->async_data = NULL; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; return io_init_fail_req(req, -EINVAL); } opcode = array_index_nospec(opcode, IORING_OP_LAST); def = &io_issue_defs[opcode]; if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { /* * A 128b op on a non-128b SQ requires mixed SQE support as * well as 2 contiguous entries. */ if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || !(ctx->cached_sq_head & (ctx->sq_entries - 1))) return io_init_fail_req(req, -EINVAL); /* * A 128b operation on a mixed SQ uses two entries, so we have * to increment the head and cached refs, and decrement what's * left. */ current->io_uring->cached_refs++; ctx->cached_sq_head++; (*left)--; } if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return io_init_fail_req(req, -EINVAL); if (sqe_flags & IOSQE_BUFFER_SELECT) { if (!def->buffer_select) return io_init_fail_req(req, -EOPNOTSUPP); req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) return io_init_fail_req(req, -EOPNOTSUPP); io_init_drain(ctx); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ if (ctx->drain_active) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { ctx->drain_next = false; ctx->drain_active = true; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } if (!def->ioprio && sqe->ioprio) return io_init_fail_req(req, -EINVAL); if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) return io_init_fail_req(req, -EINVAL); if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; req->cqe.fd = READ_ONCE(sqe->fd); /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. */ if (state->need_plug && def->plug) { state->plug_started = true; state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } } personality = READ_ONCE(sqe->personality); if (personality) { int ret; req->creds = xa_load(&ctx->personalities, personality); if (!req->creds) return io_init_fail_req(req, -EINVAL); get_cred(req->creds); ret = security_uring_override_creds(req->creds); if (ret) { put_cred(req->creds); return io_init_fail_req(req, ret); } req->flags |= REQ_F_CREDS; } return def->prep(req, sqe); } static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, struct io_kiocb *req, int ret) { struct io_ring_ctx *ctx = req->ctx; struct io_submit_link *link = &ctx->submit_state.link; struct io_kiocb *head = link->head; trace_io_uring_req_failed(sqe, req, ret); /* * Avoid breaking links in the middle as it renders links with SQPOLL * unusable. Instead of failing eagerly, continue assembling the link if * applicable and mark the head with REQ_F_FAIL. The link flushing code * should find the flag and handle the rest. */ req_fail_link_node(req, ret); if (head && !(head->flags & REQ_F_FAIL)) req_fail_link_node(head, -ECANCELED); if (!(req->flags & IO_REQ_LINK_FLAGS)) { if (head) { link->last->link = req; link->head = NULL; req = head; } io_queue_sqe_fallback(req); return ret; } if (head) link->last->link = req; else link->head = req; link->last = req; return 0; } static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; ret = io_init_req(ctx, req, sqe, left); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ if (unlikely(link->head)) { trace_io_uring_link(req, link->last); io_req_sqe_copy(req, IO_URING_F_INLINE); link->last->link = req; link->last = req; if (req->flags & IO_REQ_LINK_FLAGS) return 0; /* last request of the link, flush it */ req = link->head; link->head = NULL; if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) goto fallback; } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { if (req->flags & IO_REQ_LINK_FLAGS) { link->head = req; link->last = req; } else { fallback: io_queue_sqe_fallback(req); } return 0; } io_queue_sqe(req, IO_URING_F_INLINE); return 0; } /* * Batched submission is done, ensure local IO is flushed out. */ static void io_submit_state_end(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; if (unlikely(state->link.head)) io_queue_sqe_fallback(state->link.head); /* flush only after queuing links as they can generate completions */ io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); } /* * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { state->plug_started = false; state->need_plug = max_ios > 2; state->submit_nr = max_ios; /* set only head, no need to init link_last in advance */ state->link.head = NULL; } static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; /* * Ensure any loads from the SQEs are done at this point, * since once we write the new head, the application could * write new data to them. */ smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* * Fetch an sqe, if one is available. Note this returns a pointer to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; if (static_branch_unlikely(&io_key_has_sqarray) && (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; } head = array_index_nospec(head, ctx->sq_entries); } /* * The cached sq head (or cq tail) serves two purposes: * * 1) allows us to batch the cost of updating the user visible * head updates. * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ /* double index for 128-byte SQEs, twice as long */ if (ctx->flags & IORING_SETUP_SQE128) head <<= 1; *sqe = &ctx->sq_sqes[head]; return true; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); unsigned int left; int ret; entries = min(nr, entries); if (unlikely(!entries)) return 0; ret = left = entries; io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); do { const struct io_uring_sqe *sqe; struct io_kiocb *req; if (unlikely(!io_alloc_req(ctx, &req))) break; if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } /* * Continue submitting even for sqe failure if the * ring was setup with IORING_SETUP_SUBMIT_ALL */ if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { left--; break; } } while (--left); if (unlikely(left)) { ret -= left; /* try again if it submitted nothing and can't allocate a req */ if (!ret && io_req_cache_empty(ctx)) ret = -EAGAIN; current->io_uring->cached_refs += left; } io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); return ret; } static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) return 0; } if (io_run_task_work() > 0) return 0; if (task_sigpending(current)) return -EINTR; return 0; } static bool current_pending_io(void) { struct io_uring_task *tctx = current->io_uring; if (!tctx) return false; return percpu_counter_read_positive(&tctx->inflight); } static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) { struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); WRITE_ONCE(iowq->hit_timeout, 1); iowq->min_timeout = 0; wake_up_process(iowq->wq.private); return HRTIMER_NORESTART; } /* * Doing min_timeout portion. If we saw any timeouts, events, or have work, * wake up. If not, and we have a normal timeout, switch to that and keep * sleeping. */ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) { struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); struct io_ring_ctx *ctx = iowq->ctx; /* no general timeout, or shorter (or equal), we are done */ if (iowq->timeout == KTIME_MAX || ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) goto out_wake; /* work we may need to run, wake function will see if we need to wake */ if (io_has_work(ctx)) goto out_wake; /* got events since we started waiting, min timeout is done */ if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) goto out_wake; /* if we have any events and min timeout expired, we're done */ if (io_cqring_events(ctx)) goto out_wake; /* * If using deferred task_work running and application is waiting on * more than one request, ensure we reset it now where we are switching * to normal sleeps. Any request completion post min_wait should wake * the task and return. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 1); smp_mb(); if (!llist_empty(&ctx->work_llist)) goto out_wake; } /* any generated CQE posted past this time should wake us up */ iowq->cq_tail = iowq->cq_min_tail; hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); hrtimer_set_expires(timer, iowq->timeout); return HRTIMER_RESTART; out_wake: return io_cqring_timer_wakeup(timer); } static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, clockid_t clock_id, ktime_t start_time) { ktime_t timeout; if (iowq->min_timeout) { timeout = ktime_add_ns(iowq->min_timeout, start_time); hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, HRTIMER_MODE_ABS); } else { timeout = iowq->timeout; hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, HRTIMER_MODE_ABS); } hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); if (!READ_ONCE(iowq->hit_timeout)) schedule(); hrtimer_cancel(&iowq->t); destroy_hrtimer_on_stack(&iowq->t); __set_current_state(TASK_RUNNING); return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; } struct ext_arg { size_t argsz; struct timespec64 ts; const sigset_t __user *sig; ktime_t min_time; bool ts_set; bool iowait; }; static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct ext_arg *ext_arg, ktime_t start_time) { int ret = 0; /* * Mark us as being in io_wait if we have pending requests, so cpufreq * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ if (ext_arg->iowait && current_pending_io()) current->in_iowait = 1; if (iowq->timeout != KTIME_MAX || iowq->min_timeout) ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); else schedule(); current->in_iowait = 0; return ret; } /* If this returns > 0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct ext_arg *ext_arg, ktime_t start_time) { if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(io_local_work_pending(ctx))) return 1; if (unlikely(task_work_pending(current))) return 1; if (unlikely(task_sigpending(current))) return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); } /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, struct ext_arg *ext_arg) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; ktime_t start_time; int ret; min_events = min_t(int, min_events, ctx->cq_entries); if (!io_allowed_run_tw(ctx)) return -EEXIST; if (io_local_work_pending(ctx)) io_run_local_work(ctx, min_events, max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); io_run_task_work(); if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) io_cqring_do_overflow_flush(ctx); if (__io_cqring_events_user(ctx) >= min_events) return 0; init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.hit_timeout = 0; iowq.min_timeout = ext_arg->min_time; iowq.timeout = KTIME_MAX; start_time = io_get_time(ctx); if (ext_arg->ts_set) { iowq.timeout = timespec64_to_ktime(ext_arg->ts); if (!(flags & IORING_ENTER_ABS_TIMER)) iowq.timeout = ktime_add(iowq.timeout, start_time); } if (ext_arg->sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, ext_arg->argsz); else #endif ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); if (ret) return ret; } io_napi_busy_loop(ctx, &iowq); trace_io_uring_cqring_wait(ctx, min_events); do { unsigned long check_cq; int nr_wait; /* if min timeout has been hit, don't reset wait count */ if (!iowq.hit_timeout) nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); else nr_wait = 1; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); } ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ if (io_local_work_pending(ctx)) io_run_local_work(ctx, nr_wait, nr_wait); io_run_task_work(); /* * Non-local task_work will be run on exit to userspace, but * if we're using DEFER_TASKRUN, then we could have waited * with a timeout for a number of requests. If the timeout * hits, we could have some requests ready to process. Ensure * this break is _after_ we have run task_work, to avoid * deferring running potentially pending requests until the * next time we wait for events. */ if (ret < 0) break; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { /* let the caller flush overflows, retry */ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) io_cqring_do_overflow_flush(ctx); if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { ret = -EBADR; break; } } if (io_should_wake(&iowq)) { ret = 0; break; } cond_resched(); } while (1); if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } static void io_rings_free(struct io_ring_ctx *ctx) { io_free_region(ctx->user, &ctx->sq_region); io_free_region(ctx->user, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } static int rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, struct io_rings_layout *rl) { struct io_rings *rings; size_t sqe_size; size_t off; if (flags & IORING_SETUP_CQE_MIXED) { if (cq_entries < 2) return -EOVERFLOW; } if (flags & IORING_SETUP_SQE_MIXED) { if (sq_entries < 2) return -EOVERFLOW; } rl->sq_array_offset = SIZE_MAX; sqe_size = sizeof(struct io_uring_sqe); if (flags & IORING_SETUP_SQE128) sqe_size *= 2; rl->sq_size = array_size(sqe_size, sq_entries); if (rl->sq_size == SIZE_MAX) return -EOVERFLOW; off = struct_size(rings, cqes, cq_entries); if (flags & IORING_SETUP_CQE32) off = size_mul(off, 2); if (off == SIZE_MAX) return -EOVERFLOW; #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) return -EOVERFLOW; #endif if (!(flags & IORING_SETUP_NO_SQARRAY)) { size_t sq_array_size; rl->sq_array_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); off = size_add(off, sq_array_size); if (off == SIZE_MAX) return -EOVERFLOW; } rl->rings_size = off; return 0; } static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); io_poison_req(req); kmem_cache_free(req_cachep, req); nr++; } if (nr) { ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); } } static __cold void io_req_caches_free(struct io_ring_ctx *ctx) { guard(mutex)(&ctx->uring_lock); __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_free_alloc_caches(ctx); io_destroy_buffers(ctx); io_free_region(ctx->user, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); if (ctx->submitter_task) put_task_struct(ctx->submitter_task); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) static_branch_dec(&io_key_has_sqarray); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); WARN_ON_ONCE(ctx->nr_req_allocated); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } static __cold void io_activate_pollwq_cb(struct callback_head *cb) { struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, poll_wq_task_work); mutex_lock(&ctx->uring_lock); ctx->poll_activated = true; mutex_unlock(&ctx->uring_lock); /* * Wake ups for some events between start of polling and activation * might've been lost due to loose synchronisation. */ wake_up_all(&ctx->poll_wq); percpu_ref_put(&ctx->refs); } __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ if (ctx->poll_activated || ctx->poll_wq_task_work.func) goto out; if (WARN_ON_ONCE(!ctx->task_complete)) goto out; if (!ctx->submitter_task) goto out; /* * with ->submitter_task only the submitter task completes requests, we * only need to sync with it, which is done by injecting a tw */ init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); percpu_ref_get(&ctx->refs); if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) percpu_ref_put(&ctx->refs); out: spin_unlock(&ctx->completion_lock); } static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; if (unlikely(!ctx->poll_activated)) io_activate_pollwq(ctx); /* * provides mb() which pairs with barrier from wq_has_sleeper * call in io_commit_cqring */ poll_wait(file, &ctx->poll_wq, wait); if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Don't flush cqring overflow list here, just do a simple check. * Otherwise there could possible be ABBA deadlock: * CPU0 CPU1 * ---- ---- * lock(&ctx->uring_lock); * lock(&ep->mtx); * lock(&ctx->uring_lock); * lock(&ep->mtx); * * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushes them to do the flush. */ if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } struct io_tctx_exit { struct callback_head task_work; struct completion completion; struct io_ring_ctx *ctx; }; static __cold void io_tctx_exit_cb(struct callback_head *cb) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_exit *work; work = container_of(cb, struct io_tctx_exit, task_work); /* * When @in_cancel, we're in cancellation and it's racy to remove the * node. It'll be removed by the end of cancellation, just ignore it. * tctx can be NULL if the queueing of this task_work raced with * work cancelation off the exec path. */ if (tctx && !atomic_read(&tctx->in_cancel)) io_uring_del_tctx_node((unsigned long)work->ctx); complete(&work->completion); } static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while * we're waiting for refs to drop. We need to reap these manually, * as nobody else will be looking for them. */ do { if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { mutex_lock(&ctx->uring_lock); io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); /* The SQPOLL thread never reaches this path */ while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; io_sq_thread_park(sqd); tsk = sqpoll_task_locked(sqd); if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); io_sq_thread_unpark(sqd); } io_req_caches_free(ctx); if (WARN_ON_ONCE(time_after(jiffies, timeout))) { /* there is little hope left, don't run it too often */ interval = HZ * 60; } /* * This is really an uninterruptible wait, as it has to be * complete. But it's also run from a kworker, which doesn't * take signals, so it's fine to make it interruptible. This * avoids scenarios where we knowingly can wait much longer * on completions, for example if someone does a SIGSTOP on * a task that needs to finish task_work to make this loop * complete. That's a synthetic situation that should not * cause a stuck task backtrace, and hence a potential panic * on stuck tasks if that is enabled. */ } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); exit.ctx = ctx; mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->tctx_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); /* don't spin on a single task if cancellation failed */ list_rotate_left(&ctx->tctx_list); ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); if (WARN_ON_ONCE(ret)) continue; mutex_unlock(&ctx->tctx_lock); mutex_unlock(&ctx->uring_lock); /* * See comment above for * wait_for_completion_interruptible_timeout() on why this * wait is marked as interruptible. */ wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->tctx_lock); } mutex_unlock(&ctx->tctx_lock); mutex_unlock(&ctx->uring_lock); spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); /* pairs with RCU read section in io_req_local_work_add() */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) synchronize_rcu(); io_ring_ctx_free(ctx); } static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; struct creds *creds; mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* * Use system_dfl_wq to avoid spawning tons of event kworkers * if we're exiting a ton of rings at the same time. It just adds * noise and overhead, there's no discernable change in runtime * over using system_percpu_wq. */ queue_work(iou_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) { struct io_ring_ctx *ctx = file->private_data; file->private_data = NULL; io_ring_ctx_wait_and_kill(ctx); return 0; } static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, const struct io_uring_getevents_arg __user *uarg) { unsigned long size = sizeof(struct io_uring_reg_wait); unsigned long offset = (uintptr_t)uarg; unsigned long end; if (unlikely(offset % sizeof(long))) return ERR_PTR(-EFAULT); /* also protects from NULL ->cq_wait_arg as the size would be 0 */ if (unlikely(check_add_overflow(offset, size, &end) || end > ctx->cq_wait_size)) return ERR_PTR(-EFAULT); offset = array_index_nospec(offset, ctx->cq_wait_size - size); return ctx->cq_wait_arg + offset; } static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, const void __user *argp, size_t argsz) { struct io_uring_getevents_arg arg; if (!(flags & IORING_ENTER_EXT_ARG)) return 0; if (flags & IORING_ENTER_EXT_ARG_REG) return -EINVAL; if (argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; return 0; } static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, const void __user *argp, struct ext_arg *ext_arg) { const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { ext_arg->sig = (const sigset_t __user *) argp; return 0; } if (flags & IORING_ENTER_EXT_ARG_REG) { struct io_uring_reg_wait *w; if (ext_arg->argsz != sizeof(struct io_uring_reg_wait)) return -EINVAL; w = io_get_ext_arg_reg(ctx, argp); if (IS_ERR(w)) return PTR_ERR(w); if (w->flags & ~IORING_REG_WAIT_TS) return -EINVAL; ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC; ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask)); ext_arg->argsz = READ_ONCE(w->sigmask_sz); if (w->flags & IORING_REG_WAIT_TS) { ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec); ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec); ext_arg->ts_set = true; } return 0; } /* * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ if (ext_arg->argsz != sizeof(arg)) return -EINVAL; #ifdef CONFIG_64BIT if (!user_access_begin(uarg, sizeof(*uarg))) return -EFAULT; unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end); unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end); unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end); unsafe_get_user(arg.ts, &uarg->ts, uaccess_end); user_access_end(); #else if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; #endif ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; ext_arg->sig = u64_to_user_ptr(arg.sigmask); ext_arg->argsz = arg.sigmask_sz; if (arg.ts) { if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts))) return -EFAULT; ext_arg->ts_set = true; } return 0; #ifdef CONFIG_64BIT uaccess_end: user_access_end(); return -EFAULT; #endif } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const void __user *, argp, size_t, argsz) { struct io_ring_ctx *ctx; struct file *file; long ret; if (unlikely(flags & ~IORING_ENTER_FLAGS)) return -EINVAL; /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. */ if (flags & IORING_ENTER_REGISTERED_RING) { struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; if (unlikely(!file)) return -EBADF; } else { file = fget(fd); if (unlikely(!file)) return -EBADF; ret = -EOPNOTSUPP; if (unlikely(!io_is_uring_fops(file))) goto out; } ctx = file->private_data; ret = -EBADFD; if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; /* * For SQ polling, the thread will do all submissions and completions. * Just return the requested submit count, and wake the thread if * we were asked to. */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) io_sqpoll_wait_sq(ctx); ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit); if (ret != to_submit) { mutex_unlock(&ctx->uring_lock); goto out; } if (flags & IORING_ENTER_GETEVENTS) { if (ctx->syscall_iopoll) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and * it should handle ownership problems if any. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) (void)io_run_local_work_locked(ctx, min_complete); } mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { int ret2; if (ctx->syscall_iopoll) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to * prevent racing with polled issue that got punted to * a workqueue. */ mutex_lock(&ctx->uring_lock); iopoll_locked: ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); if (likely(!ret2)) ret2 = io_iopoll_check(ctx, min_complete); mutex_unlock(&ctx->uring_lock); } else { struct ext_arg ext_arg = { .argsz = argsz }; ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); if (likely(!ret2)) ret2 = io_cqring_wait(ctx, min_complete, flags, &ext_arg); } if (!ret) { ret = ret2; /* * EBADR indicates that one or more CQE were dropped. * Once the user has been informed we can clear the bit * as they are obviously ok with those drops. */ if (unlikely(ret2 == -EBADR)) clear_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); } } out: if (!(flags & IORING_ENTER_REGISTERED_RING)) fput(file); return ret; } static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, .get_unmapped_area = io_uring_get_unmapped_area, #ifndef CONFIG_MMU .mmap_capabilities = io_uring_nommu_mmap_capabilities, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = io_uring_show_fdinfo, #endif }; bool io_is_uring_fops(struct file *file) { return file->f_op == &io_uring_fops; } static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_ctx_config *config) { struct io_uring_params *p = &config->p; struct io_rings_layout *rl = &config->layout; struct io_uring_region_desc rd; struct io_rings *rings; int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(rl->rings_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); if (ret) return ret; ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(rl->sq_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); if (ret) { io_rings_free(ctx); return ret; } ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); memset(rings, 0, sizeof(*rings)); WRITE_ONCE(rings->sq_ring_mask, ctx->sq_entries - 1); WRITE_ONCE(rings->cq_ring_mask, ctx->cq_entries - 1); WRITE_ONCE(rings->sq_ring_entries, ctx->sq_entries); WRITE_ONCE(rings->cq_ring_entries, ctx->cq_entries); return 0; } static int io_uring_install_fd(struct file *file) { int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; fd_install(fd, file); return fd; } /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this * fd to gain access to the SQ/CQ ring details. */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { /* Create a new inode so that the LSM can block the creation. */ return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); } static int io_uring_sanitise_params(struct io_uring_params *p) { unsigned flags = p->flags; if (flags & ~IORING_SETUP_FLAGS) return -EINVAL; /* There is no way to mmap rings without a real fd */ if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && !(flags & IORING_SETUP_NO_MMAP)) return -EINVAL; if (flags & IORING_SETUP_SQPOLL) { /* IPI related flags don't make sense with SQPOLL */ if (flags & (IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; } if (flags & IORING_SETUP_TASKRUN_FLAG) { if (!(flags & (IORING_SETUP_COOP_TASKRUN | IORING_SETUP_DEFER_TASKRUN))) return -EINVAL; } /* HYBRID_IOPOLL only valid with IOPOLL */ if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) return -EINVAL; /* * For DEFER_TASKRUN we require the completion task to be the same as * the submission task. This implies that there is only one submitter. */ if ((flags & IORING_SETUP_DEFER_TASKRUN) && !(flags & IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; /* * Nonsensical to ask for CQE32 and mixed CQE support, it's not * supported to post 16b CQEs on a ring setup with CQE32. */ if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) return -EINVAL; /* * Nonsensical to ask for SQE128 and mixed SQE support, it's not * supported to post 64b SQEs on a ring setup with SQE128. */ if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) return -EINVAL; return 0; } static int io_uring_fill_params(struct io_uring_params *p) { unsigned entries = p->sq_entries; if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { if (!(p->flags & IORING_SETUP_CLAMP)) return -EINVAL; entries = IORING_MAX_ENTRIES; } /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, * since the sqes are only used at submission time. This allows for * some flexibility in overcommitting a bit. If the application has * set IORING_SETUP_CQSIZE, it will have passed in the desired number * of CQ ring entries manually. */ p->sq_entries = roundup_pow_of_two(entries); if (p->flags & IORING_SETUP_CQSIZE) { /* * If IORING_SETUP_CQSIZE is set, we do the same roundup * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ if (!p->cq_entries) return -EINVAL; if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { if (!(p->flags & IORING_SETUP_CLAMP)) return -EINVAL; p->cq_entries = IORING_MAX_CQ_ENTRIES; } p->cq_entries = roundup_pow_of_two(p->cq_entries); if (p->cq_entries < p->sq_entries) return -EINVAL; } else { p->cq_entries = 2 * p->sq_entries; } return 0; } int io_prepare_config(struct io_ctx_config *config) { struct io_uring_params *p = &config->p; int ret; ret = io_uring_sanitise_params(p); if (ret) return ret; ret = io_uring_fill_params(p); if (ret) return ret; ret = rings_size(p->flags, p->sq_entries, p->cq_entries, &config->layout); if (ret) return ret; p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); p->sq_off.resv1 = 0; if (!(p->flags & IORING_SETUP_NO_MMAP)) p->sq_off.user_addr = 0; p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->cq_off.resv1 = 0; if (!(p->flags & IORING_SETUP_NO_MMAP)) p->cq_off.user_addr = 0; if (!(p->flags & IORING_SETUP_NO_SQARRAY)) p->sq_off.array = config->layout.sq_array_offset; return 0; } static __cold int io_uring_create(struct io_ctx_config *config) { struct io_uring_params *p = &config->p; struct io_ring_ctx *ctx; struct io_uring_task *tctx; struct file *file; int ret; ret = io_prepare_config(config); if (ret) return ret; ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; ctx->clockid = CLOCK_MONOTONIC; ctx->clock_offset = 0; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) static_branch_inc(&io_key_has_sqarray); if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) ctx->lockless_cq = true; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ if (!ctx->task_complete) ctx->poll_activated = true; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events * polling again, they can rely on io_sq_thread to do polling * work, which can reduce cpu usage and uring_lock contention. */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->syscall_iopoll = 1; ctx->compat = in_compat_syscall(); if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) ctx->notify_method = TWA_SIGNAL_NO_IPI; else ctx->notify_method = TWA_SIGNAL; /* * This is just grabbed for accounting purposes. When a process exits, * the mm is exited and dropped before the files, hence we need to hang * on to this mm purely for the purposes of being able to unaccount * memory (locked/pinned vm). It's not used for anything else. */ mmgrab(current->mm); ctx->mm_account = current->mm; ret = io_allocate_scq_urings(ctx, config); if (ret) goto err; ret = io_sq_offload_create(ctx, p); if (ret) goto err; p->features = IORING_FEAT_FLAGS; if (copy_to_user(config->uptr, p, sizeof(*p))) { ret = -EFAULT; goto err; } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !(ctx->flags & IORING_SETUP_R_DISABLED)) { /* * Unlike io_register_enable_rings(), don't need WRITE_ONCE() * since ctx isn't yet accessible from other tasks */ ctx->submitter_task = get_task_struct(current); } file = io_uring_get_file(ctx); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err; } ret = __io_uring_add_tctx_node(ctx); if (ret) goto err_fput; tctx = current->io_uring; /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); else ret = io_uring_install_fd(file); if (ret < 0) goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; err_fput: fput(file); return ret; } /* * Sets up an aio uring context, and returns the fd. Applications asks for a * ring size, we return the actual sq/cq ring sizes (among other things) in the * params structure passed in. */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_ctx_config config; memset(&config, 0, sizeof(config)); if (copy_from_user(&config.p, params, sizeof(config.p))) return -EFAULT; if (!mem_is_zero(&config.p.resv, sizeof(config.p.resv))) return -EINVAL; config.p.sq_entries = entries; config.uptr = params; return io_uring_create(&config); } static inline int io_uring_allowed(void) { int disabled = READ_ONCE(sysctl_io_uring_disabled); kgid_t io_uring_group; if (disabled == 2) return -EPERM; if (disabled == 0 || capable(CAP_SYS_ADMIN)) goto allowed_lsm; io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); if (!gid_valid(io_uring_group)) return -EPERM; if (!in_group_p(io_uring_group)) return -EPERM; allowed_lsm: return security_uring_allowed(); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, struct io_uring_params __user *, params) { int ret; ret = io_uring_allowed(); if (ret) return ret; return io_uring_setup(entries, params); } static int __init io_uring_init(void) { struct kmem_cache_args kmem_args = { .useroffset = offsetof(struct io_kiocb, cmd.data), .usersize = sizeof_field(struct io_kiocb, cmd.data), .freeptr_offset = offsetof(struct io_kiocb, work), .use_freeptr_offset = true, }; #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ } while (0) #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename) #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \ __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename) BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); BUILD_BUG_SQE_ELEM(0, __u8, opcode); BUILD_BUG_SQE_ELEM(1, __u8, flags); BUILD_BUG_SQE_ELEM(2, __u16, ioprio); BUILD_BUG_SQE_ELEM(4, __s32, fd); BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(8, __u32, cmd_op); BUILD_BUG_SQE_ELEM(12, __u32, __pad1); BUILD_BUG_SQE_ELEM(16, __u64, addr); BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(28, __u32, rename_flags); BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags); BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags); BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_SQE_ELEM(44, __u16, addr_len); BUILD_BUG_SQE_ELEM(44, __u8, write_stream); BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); /* ->buf_index is u16 */ BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); /* top 8bits are for internal use */ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); io_uring_optable_init(); /* imu->dir is u8 */ BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling * requires copying in user memory into the io_kiocb object in that * range, and HARDENED_USERCOPY will complain if we haven't * correctly annotated this range. */ req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); BUG_ON(!iou_wq); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); #endif return 0; }; __initcall(io_uring_init);
422 421 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/tty_port.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.) */ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) _L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; struct tty_operations; /** * struct tty_struct - state associated with a tty while open * * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) * @driver: &struct tty_driver operating this tty * @ops: &struct tty_operations of @driver for this tty (open, close, etc.) * @index: index of this tty (e.g. to construct @name like tty12) * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty * @ldisc: the current line discipline for this tty (n_tty by default) * @atomic_write_lock: protects against concurrent writers, i.e. locks * @write_cnt, @write_buf and similar * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex), * protecting several operations on this tty * @throttle_mutex: protects against concurrent tty_throttle_safe() and * tty_unthrottle_safe() (but not tty_unthrottle()) * @termios_rwsem: protects @termios and @termios_locked * @winsize_mutex: protects @winsize * @termios: termios for the current tty, copied from/to @driver.termios * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS * ioctls) * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3) * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ... * @count: count of open processes, reaching zero cancels all the work for * this tty and drops a @kref too (but does not free this tty) * @winsize: size of the terminal "window" (cf. @winsize_mutex) * @flow: flow settings grouped together * @flow.lock: lock for @flow members * @flow.stopped: tty stopped/started by stop_tty()/start_tty() * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has * precedence over @flow.stopped) * @ctrl: control settings grouped together * @ctrl.lock: lock for @ctrl members * @ctrl.pgrp: process group of this tty (setpgrp(2)) * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both * @ctrl.lock and @legacy_mutex, readers must use at least one of * them. * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants) * @ctrl.packet: packet mode enabled * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS * handling * @receive_room: bytes permitted to feed to @ldisc without any being lost * @flow_change: controls behavior of throttling, see tty_throttle_safe() and * tty_unthrottle_safe() * @link: link to another pty (master -> slave and vice versa) * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper() * @write_wait: concurrent writers are waiting in this queue until they are * allowed to write * @read_wait: readers wait for data in this queue * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while * freeing the tty, (re)used to release_one_tty() * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data) * @driver_data: pointer to @driver's private data (e.g. &struct uart_state) * @files_lock: protects @tty_files list * @tty_files: list of (re)openers of this tty (i.e. linked &struct * tty_file_private) * @closing: when set during close, n_tty processes only START & STOP chars * @write_buf: temporary buffer used during tty_write() to copy user data to * @write_cnt: count of bytes written in tty_write() to @write_buf * @SAK_work: if the tty has a pending do_SAK, it is queued here * @port: persistent storage for this device (i.e. &struct tty_port) * * All of the state associated with a tty while the tty is open. Persistent * storage for tty devices is referenced here as @port and is documented in * &struct tty_port. */ struct tty_struct { struct kref kref; int index; struct device *dev; struct tty_driver *driver; struct tty_port *port; const struct tty_operations *ops; struct tty_ldisc *ldisc; struct ld_semaphore ldisc_sem; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; struct ktermios termios, termios_locked; char name[64]; unsigned long flags; int count; unsigned int receive_room; struct winsize winsize; struct { spinlock_t lock; bool stopped; bool tco_stopped; } flow; struct { struct pid *pgrp; struct pid *session; spinlock_t lock; unsigned char pktstatus; bool packet; } ctrl; bool hw_stopped; bool closing; int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; int write_cnt; u8 *write_buf; struct list_head tty_files; struct work_struct SAK_work; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /** * enum tty_struct_flags - TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * * @TTY_THROTTLED: * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * * @TTY_IO_ERROR: * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * * @TTY_OTHER_CLOSED: * Device is a pty and the other side has closed. * * @TTY_EXCLUSIVE: * Exclusive open mode (a single opener). * * @TTY_DO_WRITE_WAKEUP: * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * * @TTY_LDISC_OPEN: * Indicates that a line discipline is open. For debugging purposes only. * * @TTY_PTY_LOCK: * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * * @TTY_NO_WRITE_SPLIT: * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * * @TTY_HUPPED: * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * * @TTY_HUPPING: * The TTY is in the process of hanging up to abort potential readers. * * @TTY_LDISC_CHANGING: * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. * * @TTY_LDISC_HALTED: * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ enum tty_struct_flags { TTY_THROTTLED, TTY_IO_ERROR, TTY_OTHER_CLOSED, TTY_EXCLUSIVE, TTY_DO_WRITE_WAKEUP, TTY_LDISC_OPEN, TTY_PTY_LOCK, TTY_NO_WRITE_SPLIT, TTY_HUPPED, TTY_HUPPING, TTY_LDISC_CHANGING, TTY_LDISC_HALTED, }; static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY void tty_kref_put(struct tty_struct *tty); struct pid *tty_get_pgrp(struct tty_struct *tty); void tty_vhangup_self(void); void disassociate_ctty(int priv); dev_t tty_devnum(struct tty_struct *tty); void proc_clear_tty(struct task_struct *p); struct tty_struct *get_current_tty(void); /* tty_io.c */ int __init tty_init(void); const char *tty_name(const struct tty_struct *tty); struct tty_struct *tty_kopen_exclusive(dev_t device); struct tty_struct *tty_kopen_shared(dev_t device); void tty_kclose(struct tty_struct *tty); int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen_exclusive(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; int vcs_init(void); extern const struct class tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Returns: a new reference to a tty object * * Locking: The caller must hold sufficient locks/counts to ensure that their * existing reference cannot go away. */ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } const char *tty_driver_name(const struct tty_struct *tty); void tty_wait_until_sent(struct tty_struct *tty, long timeout); void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); int tty_send_xchar(struct tty_struct *tty, u8 ch); int tty_put_char(struct tty_struct *tty, u8 c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); void tty_driver_flush_buffer(struct tty_struct *tty); void tty_unthrottle(struct tty_struct *tty); bool tty_throttle_safe(struct tty_struct *tty); bool tty_unthrottle_safe(struct tty_struct *tty); int tty_do_resize(struct tty_struct *tty, struct winsize *ws); int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); int tty_get_tiocm(struct tty_struct *tty); int is_current_pgrp_orphaned(void); void tty_hangup(struct tty_struct *tty); void tty_vhangup(struct tty_struct *tty); int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns: the baud rate as an integer for this terminal * * Locking: The termios lock must be held by the caller. */ static inline speed_t tty_get_baud_rate(const struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); void tty_wakeup(struct tty_struct *tty); int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); int tty_perform_flush(struct tty_struct *tty, unsigned long arg); struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); void tty_release_struct(struct tty_struct *tty, int idx); void tty_init_termios(struct tty_struct *tty); void tty_save_termios(struct tty_struct *tty); int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; /* n_tty.c */ void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT void tty_audit_exit(void); void tty_audit_fork(struct signal_struct *sig); int tty_audit_push(void); #else static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* vt.c */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ void tty_lock(struct tty_struct *tty); int tty_lock_interruptible(struct tty_struct *tty); void tty_unlock(struct tty_struct *tty); void tty_lock_slave(struct tty_struct *tty); void tty_unlock_slave(struct tty_struct *tty); void tty_set_lock_subclass(struct tty_struct *tty); #endif
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // SPDX-License-Identifier: GPL-2.0-only /* * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): * Murali Karicheri <m-karicheri2@ti.com> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/debugfs.h> #include "hsr_main.h" #include "hsr_framereg.h" static struct dentry *hsr_debugfs_root_dir; /* hsr_node_table_show - Formats and prints node_table entries */ static int hsr_node_table_show(struct seq_file *sfp, void *data) { struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; seq_printf(sfp, "Node Table entries for (%s) device\n", (priv->prot_version == PRP_V1 ? "PRP" : "HSR")); seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); seq_puts(sfp, "time_in[B], Address-B port, "); if (priv->prot_version == PRP_V1) seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n"); else seq_puts(sfp, "DAN-H\n"); rcu_read_lock(); list_for_each_entry_rcu(node, &priv->node_db, mac_list) { /* skip self node */ if (hsr_addr_is_self(priv, node->macaddress_A)) continue; seq_printf(sfp, "%pM ", &node->macaddress_A[0]); seq_printf(sfp, "%pM ", &node->macaddress_B[0]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); seq_printf(sfp, "%14x, ", node->addr_B_port); if (priv->prot_version == PRP_V1) seq_printf(sfp, "%5x, %5x, %5x\n", node->san_a, node->san_b, (node->san_a == 0 && node->san_b == 0)); else seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); int err; err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name); if (err) netdev_warn(dev, "failed to rename\n"); } /* hsr_debugfs_init - create hsr node_table file for dumping * the node table * * Description: * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { struct dentry *de = NULL; de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); if (IS_ERR(de)) { pr_err("Cannot create hsr debugfs directory\n"); return; } priv->node_tbl_root = de; de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_node_table_fops); if (IS_ERR(de)) { pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; return; } } /* hsr_debugfs_term - Tear down debugfs intrastructure * * Description: * When Debugfs is configured this routine removes debugfs file system * elements that are specific to hsr */ void hsr_debugfs_term(struct hsr_priv *priv) { debugfs_remove_recursive(priv->node_tbl_root); priv->node_tbl_root = NULL; } void hsr_debugfs_create_root(void) { hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); if (IS_ERR(hsr_debugfs_root_dir)) { pr_err("Cannot create hsr debugfs root directory\n"); hsr_debugfs_root_dir = NULL; } } void hsr_debugfs_remove_root(void) { /* debugfs_remove() internally checks NULL and ERROR */ debugfs_remove(hsr_debugfs_root_dir); }
772 773 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COOKIE_H #define __LINUX_COOKIE_H #include <linux/atomic.h> #include <linux/percpu.h> #include <asm/local.h> struct pcpu_gen_cookie { local_t nesting; u64 last; } __aligned(16); struct gen_cookie { struct pcpu_gen_cookie __percpu *local; atomic64_t forward_last ____cacheline_aligned_in_smp; atomic64_t reverse_last; }; #define COOKIE_LOCAL_BATCH 4096 #define DEFINE_COOKIE(name) \ static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ static struct gen_cookie name = { \ .local = &__##name, \ .forward_last = ATOMIC64_INIT(0), \ .reverse_last = ATOMIC64_INIT(0), \ } static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) { struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); u64 val; if (likely(local_inc_return(&local->nesting) == 1)) { val = local->last; if (__is_defined(CONFIG_SMP) && unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, &gc->forward_last); val = next - COOKIE_LOCAL_BATCH; } local->last = ++val; } else { val = atomic64_dec_return(&gc->reverse_last); } local_dec(&local->nesting); return val; } #endif /* __LINUX_COOKIE_H */
23 23 33 33 33 33 33 23 23 18 2 3 33 33 33 1 2 2 33 33 33 8 8 8 8 8 8 8 8 33 33 32 33 33 174 175 33 33 31 32 33 15 18 33 33 33 9 8 60 1 60 46 36 36 35 32 32 32 26 26 72 78 79 17 72 71 72 72 79 46 46 33 46 46 46 46 84 79 46 79 46 116 139 139 138 139 139 139 2 2 2 2 4 4 2 2 2 12 12 2 2 2 1 23 2 12 1 2 1 8 6 2 1 1 2 4 32 32 3 23 6 33 11 24 15 15 15 22 22 22 15 15 15 15 15 22 83 32 5 23 35 36 22 22 22 9 9 9 9 48 1 8 8 4 2 2 2 30 30 19 19 9 17 17 17 19 19 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge multicast support. * * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au> */ #include <linux/err.h> #include <linux/export.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/log2.h> #include <linux/netdevice.h> #include <linux/netfilter_bridge.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/inetdevice.h> #include <linux/mroute.h> #include <net/ip.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/icmpv6.h> #include <net/ipv6.h> #include <net/mld.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> #endif #include <trace/events/bridge.h> #include "br_private.h" #include "br_private_mcast_eht.h" static const struct rhashtable_params br_mdb_rht_params = { .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode), .key_offset = offsetof(struct net_bridge_mdb_entry, addr), .key_len = sizeof(struct br_ip), .automatic_shrinking = true, }; static const struct rhashtable_params br_sg_port_rht_params = { .head_offset = offsetof(struct net_bridge_port_group, rhnode), .key_offset = offsetof(struct net_bridge_port_group, key), .key_len = sizeof(struct net_bridge_port_group_sg_key), .automatic_shrinking = true, }; static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query); static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx); static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src); static void br_multicast_port_group_rexmit(struct timer_list *t); static void br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted); static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx); #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif static struct net_bridge_port_group * __br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1, bool blocked); static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); static void __br_multicast_stop(struct net_bridge_mcast *brmctx); static int br_mc_disabled_update(struct net_device *dev, bool value, struct netlink_ext_ack *extack); static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, struct net_bridge_port_group_sg_key *sg_p) { lockdep_assert_held_once(&br->multicast_lock); return rhashtable_lookup_fast(&br->sg_port_tbl, sg_p, br_sg_port_rht_params); } static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br, struct br_ip *dst) { return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); } struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst) { struct net_bridge_mdb_entry *ent; lockdep_assert_held_once(&br->multicast_lock); rcu_read_lock(); ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); rcu_read_unlock(); return ent; } static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br, __be32 dst, __u16 vid) { struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); br_dst.dst.ip4 = dst; br_dst.proto = htons(ETH_P_IP); br_dst.vid = vid; return br_mdb_ip_get(br, &br_dst); } #if IS_ENABLED(CONFIG_IPV6) static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, const struct in6_addr *dst, __u16 vid) { struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); br_dst.dst.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); br_dst.vid = vid; return br_mdb_ip_get(br, &br_dst); } #endif struct net_bridge_mdb_entry * br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { struct net_bridge *br = brmctx->br; struct br_ip ip; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || br_multicast_ctx_vlan_global_disabled(brmctx)) return NULL; if (BR_INPUT_SKB_CB(skb)->igmp) return NULL; memset(&ip, 0, sizeof(ip)); ip.proto = skb->protocol; ip.vid = vid; switch (skb->protocol) { case htons(ETH_P_IP): ip.dst.ip4 = ip_hdr(skb)->daddr; if (brmctx->multicast_igmp_version == 3) { struct net_bridge_mdb_entry *mdb; ip.src.ip4 = ip_hdr(skb)->saddr; mdb = br_mdb_ip_get_rcu(br, &ip); if (mdb) return mdb; ip.src.ip4 = 0; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip.dst.ip6 = ipv6_hdr(skb)->daddr; if (brmctx->multicast_mld_version == 2) { struct net_bridge_mdb_entry *mdb; ip.src.ip6 = ipv6_hdr(skb)->saddr; mdb = br_mdb_ip_get_rcu(br, &ip); if (mdb) return mdb; memset(&ip.src.ip6, 0, sizeof(ip.src.ip6)); } break; #endif default: ip.proto = 0; ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest); } return br_mdb_ip_get_rcu(br, &ip); } /* IMPORTANT: this function must be used only when the contexts cannot be * passed down (e.g. timer) and must be used for read-only purposes because * the vlan snooping option can change, so it can return any context * (non-vlan or vlan). Its initial intended purpose is to read timer values * from the *current* context based on the option. At worst that could lead * to inconsistent timers when the contexts are changed, i.e. src timer * which needs to re-arm with a specific delay taken from the old context */ static struct net_bridge_mcast_port * br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg) { struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx; struct net_bridge_vlan *vlan; lockdep_assert_held_once(&pg->key.port->br->multicast_lock); /* if vlan snooping is disabled use the port's multicast context */ if (!pg->key.addr.vid || !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) goto out; /* locking is tricky here, due to different rules for multicast and * vlans we need to take rcu to find the vlan and make sure it has * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under * multicast_lock which must be already held here, so the vlan's pmctx * can safely be used on return */ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(pg->key.port), pg->key.addr.vid); if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) pmctx = &vlan->port_mcast_ctx; else pmctx = NULL; rcu_read_unlock(); out: return pmctx; } static struct net_bridge_mcast_port * br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid) { struct net_bridge_mcast_port *pmctx = NULL; struct net_bridge_vlan *vlan; lockdep_assert_held_once(&port->br->multicast_lock); if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) return NULL; /* Take RCU to access the vlan. */ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid); if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) pmctx = &vlan->port_mcast_ctx; rcu_read_unlock(); return pmctx; } /* when snooping we need to check if the contexts should be used * in the following order: * - if pmctx is non-NULL (port), check if it should be used * - if pmctx is NULL (bridge), check if brmctx should be used */ static bool br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx, const struct net_bridge_mcast_port *pmctx) { if (!netif_running(brmctx->br->dev)) return false; if (pmctx) return !br_multicast_port_ctx_state_disabled(pmctx); else return !br_multicast_ctx_vlan_disabled(brmctx); } static bool br_port_group_equal(struct net_bridge_port_group *p, struct net_bridge_port *port, const unsigned char *src) { if (p->key.port != port) return false; if (!(port->flags & BR_MULTICAST_TO_UNICAST)) return true; return ether_addr_equal(src, p->eth_addr); } static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge_port_group *src_pg; struct net_bridge_mcast *brmctx; memset(&sg_key, 0, sizeof(sg_key)); brmctx = br_multicast_port_ctx_get_global(pmctx); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; if (br_sg_port_find(brmctx->br, &sg_key)) return; src_pg = __br_multicast_add_group(brmctx, pmctx, sg_ip, pg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) return; src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; } static void __fwd_del_star_excl(struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *src_pg; memset(&sg_key, 0, sizeof(sg_key)); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; src_pg = br_sg_port_find(br, &sg_key); if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) || src_pg->rt_protocol != RTPROT_KERNEL) return; br_multicast_find_del_pg(br, src_pg); } /* When a port group transitions to (or is added as) EXCLUDE we need to add it * to all other ports' S,G entries which are not blocked by the current group * for proper replication, the assumption is that any S,G blocked entries * are already added so the S,G,port lookup should skip them. * When a port group transitions from EXCLUDE -> INCLUDE mode or is being * deleted we need to remove it from all ports' S,G entries where it was * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL). */ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode) { struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *pg_lst; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr))) return; mp = br_mdb_ip_get(br, &pg->key.addr); if (!mp) return; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; for (pg_lst = mlock_dereference(mp->ports, br); pg_lst; pg_lst = mlock_dereference(pg_lst->next, br)) { struct net_bridge_group_src *src_ent; if (pg_lst == pg) continue; hlist_for_each_entry(src_ent, &pg_lst->src_list, node) { if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) continue; sg_ip.src = src_ent->addr.src; switch (filter_mode) { case MCAST_INCLUDE: __fwd_del_star_excl(pg, &sg_ip); break; case MCAST_EXCLUDE: __fwd_add_star_excl(pmctx, pg, &sg_ip); break; } } } } /* called when adding a new S,G with host_joined == false by default */ static void br_multicast_sg_host_state(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg) { struct net_bridge_mdb_entry *sg_mp; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; if (!star_mp->host_joined) return; sg_mp = br_mdb_ip_get(star_mp->br, &sg->key.addr); if (!sg_mp) return; sg_mp->host_joined = true; } /* set the host_joined state of all of *,G's S,G entries */ static void br_multicast_star_g_host_state(struct net_bridge_mdb_entry *star_mp) { struct net_bridge *br = star_mp->br; struct net_bridge_mdb_entry *sg_mp; struct net_bridge_port_group *pg; struct br_ip sg_ip; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = star_mp->addr; for (pg = mlock_dereference(star_mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) { struct net_bridge_group_src *src_ent; hlist_for_each_entry(src_ent, &pg->src_list, node) { if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) continue; sg_ip.src = src_ent->addr.src; sg_mp = br_mdb_ip_get(br, &sg_ip); if (!sg_mp) continue; sg_mp->host_joined = star_mp->host_joined; } } } static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; /* *,G exclude ports are only added to S,G entries */ if (WARN_ON(br_multicast_is_star_g(&sgmp->addr))) return; /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports * we should ignore perm entries since they're managed by user-space */ for (pp = &sgmp->ports; (p = mlock_dereference(*pp, sgmp->br)) != NULL; pp = &p->next) if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL | MDB_PG_FLAGS_PERMANENT))) return; /* currently the host can only have joined the *,G which means * we treat it as EXCLUDE {}, so for an S,G it's considered a * STAR_EXCLUDE entry and we can safely leave it */ sgmp->host_joined = false; for (pp = &sgmp->ports; (p = mlock_dereference(*pp, sgmp->br)) != NULL;) { if (!(p->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_del_pg(sgmp, p, pp); else pp = &p->next; } } void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = star_mp->br; struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *pg; struct net_bridge_mcast *brmctx; if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) return; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; br_multicast_sg_host_state(star_mp, sg); memset(&sg_key, 0, sizeof(sg_key)); sg_key.addr = sg->key.addr; /* we need to add all exclude ports to the S,G */ for (pg = mlock_dereference(star_mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) { struct net_bridge_port_group *src_pg; if (pg == sg || pg->filter_mode == MCAST_INCLUDE) continue; sg_key.port = pg->key.port; if (br_sg_port_find(br, &sg_key)) continue; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) continue; brmctx = br_multicast_port_ctx_get_global(pmctx); src_pg = __br_multicast_add_group(brmctx, pmctx, &sg->key.addr, sg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) continue; src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; } } static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { struct net_bridge_mdb_entry *star_mp; struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *sg; struct net_bridge_mcast *brmctx; struct br_ip sg_ip; if (src->flags & BR_SGRP_F_INSTALLED) return; memset(&sg_ip, 0, sizeof(sg_ip)); pmctx = br_multicast_pg_to_port_ctx(src->pg); if (!pmctx) return; brmctx = br_multicast_port_ctx_get_global(pmctx); sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip, src->pg->eth_addr, MCAST_INCLUDE, false, !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) return; src->flags |= BR_SGRP_F_INSTALLED; sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL; /* if it was added by user-space as perm we can skip next steps */ if (sg->rt_protocol != RTPROT_KERNEL && (sg->flags & MDB_PG_FLAGS_PERMANENT)) return; /* the kernel is now responsible for removing this S,G */ timer_delete(&sg->timer); star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr); if (!star_mp) return; br_multicast_sg_add_exclude_ports(star_mp, sg); } static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src, bool fastleave) { struct net_bridge_port_group *p, *pg = src->pg; struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; sg_ip.src = src->addr.src; mp = br_mdb_ip_get(src->br, &sg_ip); if (!mp) return; for (pp = &mp->ports; (p = mlock_dereference(*pp, src->br)) != NULL; pp = &p->next) { if (!br_port_group_equal(p, pg->key.port, pg->eth_addr)) continue; if (p->rt_protocol != RTPROT_KERNEL && (p->flags & MDB_PG_FLAGS_PERMANENT) && !(src->flags & BR_SGRP_F_USER_ADDED)) break; if (fastleave) p->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_del_pg(mp, p, pp); break; } src->flags &= ~BR_SGRP_F_INSTALLED; } /* install S,G and based on src's timer enable or disable forwarding */ static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge_port_group *sg; u8 old_flags; br_multicast_fwd_src_add(src); memset(&sg_key, 0, sizeof(sg_key)); sg_key.addr = src->pg->key.addr; sg_key.addr.src = src->addr.src; sg_key.port = src->pg->key.port; sg = br_sg_port_find(src->br, &sg_key); if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT)) return; old_flags = sg->flags; if (timer_pending(&src->timer)) sg->flags &= ~MDB_PG_FLAGS_BLOCKED; else sg->flags |= MDB_PG_FLAGS_BLOCKED; if (old_flags != sg->flags) { struct net_bridge_mdb_entry *sg_mp; sg_mp = br_mdb_ip_get(src->br, &sg_key.addr); if (!sg_mp) return; br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB); } } static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) { struct net_bridge_mdb_entry *mp; mp = container_of(gc, struct net_bridge_mdb_entry, mcast_gc); WARN_ON(!hlist_unhashed(&mp->mdb_node)); WARN_ON(mp->ports); timer_shutdown_sync(&mp->timer); kfree_rcu(mp, rcu); } static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp) { struct net_bridge *br = mp->br; rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, br_mdb_rht_params); hlist_del_init_rcu(&mp->mdb_node); hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } static void br_multicast_group_expired(struct timer_list *t) { struct net_bridge_mdb_entry *mp = timer_container_of(mp, t, timer); struct net_bridge *br = mp->br; spin_lock(&br->multicast_lock); if (hlist_unhashed(&mp->mdb_node) || !netif_running(br->dev) || timer_pending(&mp->timer)) goto out; br_multicast_host_leave(mp, true); if (mp->ports) goto out; br_multicast_del_mdb_entry(mp); out: spin_unlock(&br->multicast_lock); } static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) { struct net_bridge_group_src *src; src = container_of(gc, struct net_bridge_group_src, mcast_gc); WARN_ON(!hlist_unhashed(&src->node)); timer_shutdown_sync(&src->timer); kfree_rcu(src, rcu); } void __br_multicast_del_group_src(struct net_bridge_group_src *src) { struct net_bridge *br = src->pg->key.port->br; hlist_del_init_rcu(&src->node); src->pg->src_ents--; hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave) { br_multicast_fwd_src_remove(src, fastleave); __br_multicast_del_group_src(src); } static int br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx, struct netlink_ext_ack *extack, const char *what) { u32 max = READ_ONCE(pmctx->mdb_max_entries); u32 n = READ_ONCE(pmctx->mdb_n_entries); if (max && n >= max) { NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u", what, n, max); return -E2BIG; } WRITE_ONCE(pmctx->mdb_n_entries, n + 1); return 0; } static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx) { u32 n = READ_ONCE(pmctx->mdb_n_entries); WARN_ON_ONCE(n == 0); WRITE_ONCE(pmctx->mdb_n_entries, n - 1); } static int br_multicast_port_ngroups_inc(struct net_bridge_port *port, const struct br_ip *group, struct netlink_ext_ack *extack) { struct net_bridge_mcast_port *pmctx; int err; lockdep_assert_held_once(&port->br->multicast_lock); /* Always count on the port context. */ err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack, "Port"); if (err) { trace_br_mdb_full(port->dev, group); return err; } /* Only count on the VLAN context if VID is given, and if snooping on * that VLAN is enabled. */ if (!group->vid) return 0; pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid); if (!pmctx) return 0; err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN"); if (err) { trace_br_mdb_full(port->dev, group); goto dec_one_out; } return 0; dec_one_out: br_multicast_port_ngroups_dec_one(&port->multicast_ctx); return err; } static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid) { struct net_bridge_mcast_port *pmctx; lockdep_assert_held_once(&port->br->multicast_lock); if (vid) { pmctx = br_multicast_port_vid_to_port_ctx(port, vid); if (pmctx) br_multicast_port_ngroups_dec_one(pmctx); } br_multicast_port_ngroups_dec_one(&port->multicast_ctx); } u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx) { return READ_ONCE(pmctx->mdb_n_entries); } void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max) { WRITE_ONCE(pmctx->mdb_max_entries, max); } u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx) { return READ_ONCE(pmctx->mdb_max_entries); } static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) { struct net_bridge_port_group *pg; pg = container_of(gc, struct net_bridge_port_group, mcast_gc); WARN_ON(!hlist_unhashed(&pg->mglist)); WARN_ON(!hlist_empty(&pg->src_list)); timer_shutdown_sync(&pg->rexmit_timer); timer_shutdown_sync(&pg->timer); kfree_rcu(pg, rcu); } void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp) { struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; struct hlist_node *tmp; rcu_assign_pointer(*pp, pg->next); hlist_del_init(&pg->mglist); br_multicast_eht_clean_sets(pg); hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) br_multicast_del_group_src(ent, false); br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); if (!br_multicast_is_star_g(&mp->addr)) { rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode, br_sg_port_rht_params); br_multicast_sg_del_exclude_ports(mp); } else { br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); } br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg) { struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; mp = br_mdb_ip_get(br, &pg->key.addr); if (WARN_ON(!mp)) return; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p != pg) continue; br_multicast_del_pg(mp, pg, pp); return; } WARN_ON(1); } static void br_multicast_port_group_expired(struct timer_list *t) { struct net_bridge_port_group *pg = timer_container_of(pg, t, timer); struct net_bridge_group_src *src_ent; struct net_bridge *br = pg->key.port->br; struct hlist_node *tmp; bool changed; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT) goto out; changed = !!(pg->filter_mode == MCAST_EXCLUDE); pg->filter_mode = MCAST_INCLUDE; hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { if (!timer_pending(&src_ent->timer)) { br_multicast_del_group_src(src_ent, false); changed = true; } } if (hlist_empty(&pg->src_list)) { br_multicast_find_del_pg(br, pg); } else if (changed) { struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr); if (changed && br_multicast_is_star_g(&pg->key.addr)) br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); if (WARN_ON(!mp)) goto out; br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB); } out: spin_unlock(&br->multicast_lock); } static void br_multicast_gc(struct hlist_head *head) { struct net_bridge_mcast_gc *gcent; struct hlist_node *tmp; hlist_for_each_entry_safe(gcent, tmp, head, gc_node) { hlist_del_init(&gcent->gc_node); gcent->destroy(gcent); } } static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { struct net_bridge_vlan *vlan = NULL; if (pmctx && br_multicast_port_ctx_is_vlan(pmctx)) vlan = pmctx->vlan; else if (br_multicast_ctx_is_vlan(brmctx)) vlan = brmctx->vlan; if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) { u16 vlan_proto; if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0) return; __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid); } } static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, __be32 ip_dst, __be32 group, bool with_srcs, bool over_lmqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { struct net_bridge_port *p = pg ? pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, igmp_hdr_size; unsigned long now = jiffies; struct igmpv3_query *ihv3; void *csum_start = NULL; __sum16 *csum = NULL; struct sk_buff *skb; struct igmphdr *ih; struct ethhdr *eth; unsigned long lmqt; struct iphdr *iph; u16 lmqt_srcs = 0; igmp_hdr_size = sizeof(*ih); if (brmctx->multicast_igmp_version == 3) { igmp_hdr_size = sizeof(*ihv3); if (pg && with_srcs) { lmqt = now + (brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && ent->src_query_rexmit_cnt > 0) lmqt_srcs++; } if (!lmqt_srcs) return NULL; igmp_hdr_size += lmqt_srcs * sizeof(__be32); } } pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size; if ((p && pkt_size > p->dev->mtu) || pkt_size > brmctx->br->dev->mtu) return NULL; skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IP); skb_reset_mac_header(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); ip_eth_mc_map(ip_dst, eth->h_dest); eth->h_proto = htons(ETH_P_IP); skb_put(skb, sizeof(*eth)); skb_set_network_header(skb, skb->len); iph = ip_hdr(skb); iph->tot_len = htons(pkt_size - sizeof(*eth)); iph->version = 4; iph->ihl = 6; iph->tos = 0xc0; iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = ip_dst; ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ip_send_check(iph); skb_put(skb, 24); skb_set_transport_header(skb, skb->len); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; switch (brmctx->multicast_igmp_version) { case 2: ih = igmp_hdr(skb); ih->type = IGMP_HOST_MEMBERSHIP_QUERY; ih->code = (group ? brmctx->multicast_last_member_interval : brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ih->group = group; ih->csum = 0; csum = &ih->csum; csum_start = (void *)ih; break; case 3: ihv3 = igmpv3_query_hdr(skb); ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; ihv3->code = (group ? brmctx->multicast_last_member_interval : brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ihv3->group = group; ihv3->qqic = brmctx->multicast_query_interval / HZ; ihv3->nsrcs = htons(lmqt_srcs); ihv3->resv = 0; ihv3->suppress = sflag; ihv3->qrv = 2; ihv3->csum = 0; csum = &ihv3->csum; csum_start = (void *)ihv3; if (!pg || !with_srcs) break; lmqt_srcs = 0; hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && ent->src_query_rexmit_cnt > 0) { ihv3->srcs[lmqt_srcs++] = ent->addr.src.ip4; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; } } if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) { kfree_skb(skb); return NULL; } break; } if (WARN_ON(!csum || !csum_start)) { kfree_skb(skb); return NULL; } *csum = ip_compute_csum(csum_start, igmp_hdr_size); skb_put(skb, igmp_hdr_size); __skb_pull(skb, sizeof(*eth)); out: return skb; } #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, const struct in6_addr *ip6_dst, const struct in6_addr *group, bool with_srcs, bool over_llqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { struct net_bridge_port *p = pg ? pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, mld_hdr_size; unsigned long now = jiffies; struct mld2_query *mld2q; void *csum_start = NULL; unsigned long interval; __sum16 *csum = NULL; struct ipv6hdr *ip6h; struct mld_msg *mldq; struct sk_buff *skb; unsigned long llqt; struct ethhdr *eth; u16 llqt_srcs = 0; u8 *hopopt; mld_hdr_size = sizeof(*mldq); if (brmctx->multicast_mld_version == 2) { mld_hdr_size = sizeof(*mld2q); if (pg && with_srcs) { llqt = now + (brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && ent->src_query_rexmit_cnt > 0) llqt_srcs++; } if (!llqt_srcs) return NULL; mld_hdr_size += llqt_srcs * sizeof(struct in6_addr); } } pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size; if ((p && pkt_size > p->dev->mtu) || pkt_size > brmctx->br->dev->mtu) return NULL; skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IPV6); /* Ethernet header */ skb_reset_mac_header(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); /* IPv6 header + HbH option */ skb_set_network_header(skb, skb->len); ip6h = ipv6_hdr(skb); *(__force __be32 *)ip6h = htonl(0x60000000); ip6h->payload_len = htons(8 + mld_hdr_size); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ip6h->daddr = *ip6_dst; if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev, &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false); return NULL; } br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); hopopt[0] = IPPROTO_ICMPV6; /* next hdr */ hopopt[1] = 0; /* length of HbH */ hopopt[2] = IPV6_TLV_ROUTERALERT; /* Router Alert */ hopopt[3] = 2; /* Length of RA Option */ hopopt[4] = 0; /* Type = 0x0000 (MLD) */ hopopt[5] = 0; hopopt[6] = IPV6_TLV_PAD1; /* Pad1 */ hopopt[7] = IPV6_TLV_PAD1; /* Pad1 */ skb_put(skb, sizeof(*ip6h) + 8); /* ICMPv6 */ skb_set_transport_header(skb, skb->len); interval = ipv6_addr_any(group) ? brmctx->multicast_query_response_interval : brmctx->multicast_last_member_interval; *igmp_type = ICMPV6_MGM_QUERY; switch (brmctx->multicast_mld_version) { case 1: mldq = (struct mld_msg *)icmp6_hdr(skb); mldq->mld_type = ICMPV6_MGM_QUERY; mldq->mld_code = 0; mldq->mld_cksum = 0; mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); mldq->mld_reserved = 0; mldq->mld_mca = *group; csum = &mldq->mld_cksum; csum_start = (void *)mldq; break; case 2: mld2q = (struct mld2_query *)icmp6_hdr(skb); mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval)); mld2q->mld2q_type = ICMPV6_MGM_QUERY; mld2q->mld2q_code = 0; mld2q->mld2q_cksum = 0; mld2q->mld2q_resv1 = 0; mld2q->mld2q_resv2 = 0; mld2q->mld2q_suppress = sflag; mld2q->mld2q_qrv = 2; mld2q->mld2q_nsrcs = htons(llqt_srcs); mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ; mld2q->mld2q_mca = *group; csum = &mld2q->mld2q_cksum; csum_start = (void *)mld2q; if (!pg || !with_srcs) break; llqt_srcs = 0; hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && ent->src_query_rexmit_cnt > 0) { mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.src.ip6; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; } } if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) { kfree_skb(skb); return NULL; } break; } if (WARN_ON(!csum || !csum_start)) { kfree_skb(skb); return NULL; } *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size, IPPROTO_ICMPV6, csum_partial(csum_start, mld_hdr_size, 0)); skb_put(skb, mld_hdr_size); __skb_pull(skb, sizeof(*eth)); out: return skb; } #endif static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, bool with_srcs, bool over_lmqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { __be32 ip4_dst; switch (group->proto) { case htons(ETH_P_IP): ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP); return br_ip4_multicast_alloc_query(brmctx, pmctx, pg, ip4_dst, group->dst.ip4, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr ip6_dst; if (ip_dst) ip6_dst = ip_dst->dst.ip6; else ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, htonl(1)); return br_ip6_multicast_alloc_query(brmctx, pmctx, pg, &ip6_dst, &group->dst.ip6, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); } #endif } return NULL; } struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, struct br_ip *group) { struct net_bridge_mdb_entry *mp; int err; mp = br_mdb_ip_get(br, group); if (mp) return mp; if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { trace_br_mdb_full(br->dev, group); br_mc_disabled_update(br->dev, false, NULL); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); return ERR_PTR(-E2BIG); } mp = kzalloc(sizeof(*mp), GFP_ATOMIC); if (unlikely(!mp)) return ERR_PTR(-ENOMEM); mp->br = br; mp->addr = *group; mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry; timer_setup(&mp->timer, br_multicast_group_expired, 0); err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode, br_mdb_rht_params); if (err) { kfree(mp); mp = ERR_PTR(err); } else { hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list); } return mp; } static void br_multicast_group_src_expired(struct timer_list *t) { struct net_bridge_group_src *src = timer_container_of(src, t, timer); struct net_bridge_port_group *pg; struct net_bridge *br = src->br; spin_lock(&br->multicast_lock); if (hlist_unhashed(&src->node) || !netif_running(br->dev) || timer_pending(&src->timer)) goto out; pg = src->pg; if (pg->filter_mode == MCAST_INCLUDE) { br_multicast_del_group_src(src, false); if (!hlist_empty(&pg->src_list)) goto out; br_multicast_find_del_pg(br, pg); } else { br_multicast_fwd_src_handle(src); } out: spin_unlock(&br->multicast_lock); } struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) { struct net_bridge_group_src *ent; switch (ip->proto) { case htons(ETH_P_IP): hlist_for_each_entry(ent, &pg->src_list, node) if (ip->src.ip4 == ent->addr.src.ip4) return ent; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): hlist_for_each_entry(ent, &pg->src_list, node) if (!ipv6_addr_cmp(&ent->addr.src.ip6, &ip->src.ip6)) return ent; break; #endif } return NULL; } struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip) { struct net_bridge_group_src *grp_src; if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT)) return NULL; switch (src_ip->proto) { case htons(ETH_P_IP): if (ipv4_is_zeronet(src_ip->src.ip4) || ipv4_is_multicast(src_ip->src.ip4)) return NULL; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (ipv6_addr_any(&src_ip->src.ip6) || ipv6_addr_is_multicast(&src_ip->src.ip6)) return NULL; break; #endif } grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC); if (unlikely(!grp_src)) return NULL; grp_src->pg = pg; grp_src->br = pg->key.port->br; grp_src->addr = *src_ip; grp_src->mcast_gc.destroy = br_multicast_destroy_group_src; timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0); hlist_add_head_rcu(&grp_src->node, &pg->src_list); pg->src_ents++; return grp_src; } struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, u8 filter_mode, u8 rt_protocol, struct netlink_ext_ack *extack) { struct net_bridge_port_group *p; int err; err = br_multicast_port_ngroups_inc(port, group, extack); if (err) return NULL; p = kzalloc(sizeof(*p), GFP_ATOMIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); goto dec_out; } p->key.addr = *group; p->key.port = port; p->flags = flags; p->filter_mode = filter_mode; p->rt_protocol = rt_protocol; p->eht_host_tree = RB_ROOT; p->eht_set_tree = RB_ROOT; p->mcast_gc.destroy = br_multicast_destroy_port_group; INIT_HLIST_HEAD(&p->src_list); if (!br_multicast_is_star_g(group) && rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group"); goto free_out; } rcu_assign_pointer(p->next, next); timer_setup(&p->timer, br_multicast_port_group_expired, 0); timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0); hlist_add_head(&p->mglist, &port->mglist); if (src) memcpy(p->eth_addr, src, ETH_ALEN); else eth_broadcast_addr(p->eth_addr); return p; free_out: kfree(p); dec_out: br_multicast_port_ngroups_dec(port, group->vid); return NULL; } void br_multicast_del_port_group(struct net_bridge_port_group *p) { struct net_bridge_port *port = p->key.port; __u16 vid = p->key.addr.vid; hlist_del_init(&p->mglist); if (!br_multicast_is_star_g(&p->key.addr)) rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params); kfree(p); br_multicast_port_ngroups_dec(port, vid); } void br_multicast_host_join(const struct net_bridge_mcast *brmctx, struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) { mp->host_joined = true; if (br_multicast_is_star_g(&mp->addr)) br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } if (br_group_is_l2(&mp->addr)) return; mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval); } void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) return; mp->host_joined = false; if (br_multicast_is_star_g(&mp->addr)) br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB); } static struct net_bridge_port_group * __br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1, bool blocked) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p = NULL; struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; mp = br_multicast_new_group(brmctx->br, group); if (IS_ERR(mp)) return ERR_CAST(mp); if (!pmctx) { br_multicast_host_join(brmctx, mp, true); goto out; } for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (br_port_group_equal(p, pmctx->port, src)) goto found; if ((unsigned long)p->key.port < (unsigned long)pmctx->port) break; } p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, filter_mode, RTPROT_KERNEL, NULL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; } rcu_assign_pointer(*pp, p); if (blocked) p->flags |= MDB_PG_FLAGS_BLOCKED; br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB); found: if (igmpv2_mldv1) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); out: return p; } static int br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1) { struct net_bridge_port_group *pg; int err; spin_lock(&brmctx->br->multicast_lock); pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode, igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = PTR_ERR_OR_ZERO(pg); spin_unlock(&brmctx->br->multicast_lock); return err; } static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src, bool igmpv2) { struct br_ip br_group; u8 filter_mode; if (ipv4_is_local_multicast(group)) return 0; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; return br_multicast_add_group(brmctx, pmctx, &br_group, src, filter_mode, igmpv2); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src, bool mldv1) { struct br_ip br_group; u8 filter_mode; if (ipv6_addr_is_ll_all_nodes(group)) return 0; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE; return br_multicast_add_group(brmctx, pmctx, &br_group, src, filter_mode, mldv1); } #endif static bool br_multicast_rport_del(struct hlist_node *rlist) { if (hlist_unhashed(rlist)) return false; hlist_del_init_rcu(rlist); return true; } static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { return br_multicast_rport_del(&pmctx->ip4_rlist); } static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) return br_multicast_rport_del(&pmctx->ip6_rlist); #else return false; #endif } static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx, struct timer_list *t, struct hlist_node *rlist) { struct net_bridge *br = pmctx->port->br; bool del; spin_lock(&br->multicast_lock); if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || pmctx->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(t)) goto out; del = br_multicast_rport_del(rlist); br_multicast_rport_del_notify(pmctx, del); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_router_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, ip4_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_router_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, ip6_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist); } #endif static void br_mc_router_state_change(struct net_bridge *p, bool is_mc_router) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, .flags = SWITCHDEV_F_DEFER, .u.mrouter = is_mc_router, }; switchdev_port_attr_set(p->dev, &attr, NULL); } static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx, struct timer_list *timer) { spin_lock(&brmctx->br->multicast_lock); if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED || brmctx->multicast_router == MDB_RTR_TYPE_PERM || br_ip4_multicast_is_router(brmctx) || br_ip6_multicast_is_router(brmctx)) goto out; br_mc_router_state_change(brmctx->br, false); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_local_router_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip4_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_local_router_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip6_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } #endif static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { spin_lock(&brmctx->br->multicast_lock); if (!netif_running(brmctx->br->dev) || br_multicast_ctx_vlan_global_disabled(brmctx) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) goto out; br_multicast_start_querier(brmctx, query); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_querier_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip4_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_querier_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip6_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query); } #endif static void br_multicast_query_delay_expired(struct timer_list *t) { } static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx, struct br_ip *ip, struct sk_buff *skb) { if (ip->proto == htons(ETH_P_IP)) brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; #if IS_ENABLED(CONFIG_IPV6) else brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; #endif } static void __br_multicast_send_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, bool with_srcs, u8 sflag, bool *need_rexmit) { bool over_lmqt = !!sflag; struct sk_buff *skb; u8 igmp_type; if (!br_multicast_ctx_should_use(brmctx, pmctx) || !br_multicast_ctx_matches_vlan_snooping(brmctx)) return; again_under_lmqt: skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group, with_srcs, over_lmqt, sflag, &igmp_type, need_rexmit); if (!skb) return; if (pmctx) { skb->dev = pmctx->port->dev; br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type, BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); if (over_lmqt && with_srcs && sflag) { over_lmqt = false; goto again_under_lmqt; } } else { br_multicast_select_own_querier(brmctx, group, skb); br_multicast_count(brmctx->br, NULL, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); } } static void br_multicast_read_querier(const struct bridge_mcast_querier *querier, struct bridge_mcast_querier *dest) { unsigned int seq; memset(dest, 0, sizeof(*dest)); do { seq = read_seqcount_begin(&querier->seq); dest->port_ifidx = querier->port_ifidx; memcpy(&dest->addr, &querier->addr, sizeof(struct br_ip)); } while (read_seqcount_retry(&querier->seq, seq)); } static void br_multicast_update_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_querier *querier, int ifindex, struct br_ip *saddr) { write_seqcount_begin(&querier->seq); querier->port_ifidx = ifindex; memcpy(&querier->addr, saddr, sizeof(*saddr)); write_seqcount_end(&querier->seq); } static void br_multicast_send_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *own_query) { struct bridge_mcast_other_query *other_query = NULL; struct bridge_mcast_querier *querier; struct br_ip br_group; unsigned long time; if (!br_multicast_ctx_should_use(brmctx, pmctx) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || !brmctx->multicast_querier) return; memset(&br_group.dst, 0, sizeof(br_group.dst)); if (pmctx ? (own_query == &pmctx->ip4_own_query) : (own_query == &brmctx->ip4_own_query)) { querier = &brmctx->ip4_querier; other_query = &brmctx->ip4_other_query; br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) } else { querier = &brmctx->ip6_querier; other_query = &brmctx->ip6_other_query; br_group.proto = htons(ETH_P_IPV6); #endif } if (!other_query || timer_pending(&other_query->timer)) return; /* we're about to select ourselves as querier */ if (!pmctx && querier->port_ifidx) { struct br_ip zeroip = {}; br_multicast_update_querier(brmctx, querier, 0, &zeroip); } __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false, 0, NULL); time = jiffies; time += own_query->startup_sent < brmctx->multicast_startup_query_count ? brmctx->multicast_startup_query_interval : brmctx->multicast_query_interval; mod_timer(&own_query->timer, time); } static void br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *query) { struct net_bridge *br = pmctx->port->br; struct net_bridge_mcast *brmctx; spin_lock(&br->multicast_lock); if (br_multicast_port_ctx_state_stopped(pmctx)) goto out; brmctx = br_multicast_port_ctx_get_global(pmctx); if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; br_multicast_send_query(brmctx, pmctx, query); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_port_query_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, ip4_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_port_query_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, ip6_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query); } #endif static void br_multicast_port_group_rexmit(struct timer_list *t) { struct net_bridge_port_group *pg = timer_container_of(pg, t, rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; struct net_bridge *br = pg->key.port->br; struct net_bridge_mcast_port *pmctx; struct net_bridge_mcast *brmctx; bool need_rexmit = false; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) goto out; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) goto out; brmctx = br_multicast_port_ctx_get_global(pmctx); if (!brmctx->multicast_querier) goto out; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif if (!other_query || timer_pending(&other_query->timer)) goto out; if (pg->grp_query_rexmit_cnt) { pg->grp_query_rexmit_cnt--; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 1, NULL); } __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 0, &need_rexmit); if (pg->grp_query_rexmit_cnt || need_rexmit) mod_timer(&pg->rexmit_timer, jiffies + brmctx->multicast_last_member_interval); out: spin_unlock(&br->multicast_lock); } static int br_mc_disabled_update(struct net_device *dev, bool value, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, .flags = SWITCHDEV_F_DEFER, .u.mc_disabled = !value, }; return switchdev_port_attr_set(dev, &attr, extack); } void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx) { pmctx->port = port; pmctx->vlan = vlan; pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; timer_setup(&pmctx->ip4_mc_router_timer, br_ip4_multicast_router_expired, 0); timer_setup(&pmctx->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) timer_setup(&pmctx->ip6_mc_router_timer, br_ip6_multicast_router_expired, 0); timer_setup(&pmctx->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif } void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { struct net_bridge *br = pmctx->port->br; bool del = false; #if IS_ENABLED(CONFIG_IPV6) timer_delete_sync(&pmctx->ip6_mc_router_timer); #endif timer_delete_sync(&pmctx->ip4_mc_router_timer); spin_lock_bh(&br->multicast_lock); del |= br_ip6_multicast_rport_del(pmctx); del |= br_ip4_multicast_rport_del(pmctx); br_multicast_rport_del_notify(pmctx, del); spin_unlock_bh(&br->multicast_lock); } int br_multicast_add_port(struct net_bridge_port *port) { int err; port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx); err = br_mc_disabled_update(port->dev, br_opt_get(port->br, BROPT_MULTICAST_ENABLED), NULL); if (err && err != -EOPNOTSUPP) return err; port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!port->mcast_stats) return -ENOMEM; return 0; } void br_multicast_del_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; /* Take care of the remaining groups, only perm ones should be left */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) br_multicast_find_del_pg(br, pg); spin_unlock_bh(&br->multicast_lock); flush_work(&br->mcast_gc_work); br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } static void br_multicast_enable(struct bridge_mcast_own_query *query) { query->startup_sent = 0; if (timer_delete_sync_try(&query->timer) >= 0 || timer_delete(&query->timer)) mod_timer(&query->timer, jiffies); } static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge *br = pmctx->port->br; struct net_bridge_mcast *brmctx; brmctx = br_multicast_port_ctx_get_global(pmctx); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev)) return; br_multicast_enable(&pmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&pmctx->ip6_own_query); #endif if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) { br_ip4_multicast_add_router(brmctx, pmctx); br_ip6_multicast_add_router(brmctx, pmctx); } if (br_multicast_port_ctx_is_vlan(pmctx)) { struct net_bridge_port_group *pg; u32 n = 0; /* The mcast_n_groups counter might be wrong. First, * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries * are flushed, thus mcast_n_groups after the toggle does not * reflect the true values. And second, permanent entries added * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected * either. Thus we have to refresh the counter. */ hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) { if (pg->key.addr.vid == pmctx->vlan->vid) n++; } WRITE_ONCE(pmctx->mdb_n_entries, n); } } static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge *br = pmctx->port->br; spin_lock_bh(&br->multicast_lock); if (br_multicast_port_ctx_is_vlan(pmctx) && !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { spin_unlock_bh(&br->multicast_lock); return; } __br_multicast_enable_port_ctx(pmctx); spin_unlock_bh(&br->multicast_lock); } static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge_port_group *pg; struct hlist_node *n; bool del = false; hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist) if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) && (!br_multicast_port_ctx_is_vlan(pmctx) || pg->key.addr.vid == pmctx->vlan->vid)) br_multicast_find_del_pg(pmctx->port->br, pg); del |= br_ip4_multicast_rport_del(pmctx); timer_delete(&pmctx->ip4_mc_router_timer); timer_delete(&pmctx->ip4_own_query.timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) timer_delete(&pmctx->ip6_mc_router_timer); timer_delete(&pmctx->ip6_own_query.timer); #endif br_multicast_rport_del_notify(pmctx, del); } static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge *br = pmctx->port->br; spin_lock_bh(&br->multicast_lock); if (br_multicast_port_ctx_is_vlan(pmctx) && !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { spin_unlock_bh(&br->multicast_lock); return; } __br_multicast_disable_port_ctx(pmctx); spin_unlock_bh(&br->multicast_lock); } static void br_multicast_toggle_port(struct net_bridge_port *port, bool on) { #if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; rcu_read_lock(); vg = nbp_vlan_group_rcu(port); if (!vg) { rcu_read_unlock(); return; } /* iterate each vlan, toggle vlan multicast context */ list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) { struct net_bridge_mcast_port *pmctx = &vlan->port_mcast_ctx; u8 state = br_vlan_get_state(vlan); /* enable vlan multicast context when state is * LEARNING or FORWARDING */ if (on && br_vlan_state_allowed(state, true)) br_multicast_enable_port_ctx(pmctx); else br_multicast_disable_port_ctx(pmctx); } rcu_read_unlock(); return; } #endif /* toggle port multicast context when vlan snooping is disabled */ if (on) br_multicast_enable_port_ctx(&port->multicast_ctx); else br_multicast_disable_port_ctx(&port->multicast_ctx); } void br_multicast_enable_port(struct net_bridge_port *port) { br_multicast_toggle_port(port, true); } void br_multicast_disable_port(struct net_bridge_port *port) { br_multicast_toggle_port(port, false); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) { struct net_bridge_group_src *ent; struct hlist_node *tmp; int deleted = 0; hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) if (ent->flags & BR_SGRP_F_DELETE) { br_multicast_del_group_src(ent, false); deleted++; } return deleted; } static void __grp_src_mod_timer(struct net_bridge_group_src *src, unsigned long expires) { mod_timer(&src->timer, expires); br_multicast_fwd_src_handle(src); } static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; u32 lmqc = brmctx->multicast_last_member_count; unsigned long lmqt, lmi, now = jiffies; struct net_bridge_group_src *ent; if (!netif_running(brmctx->br->dev) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif lmqt = now + br_multicast_lmqt(brmctx); hlist_for_each_entry(ent, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_SEND) { ent->flags &= ~BR_SGRP_F_SEND; if (ent->timer.expires > lmqt) { if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) ent->src_query_rexmit_cnt = lmqc; __grp_src_mod_timer(ent, lmqt); } } } if (!brmctx->multicast_querier || !other_query || timer_pending(&other_query->timer)) return; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 1, NULL); lmi = now + brmctx->multicast_last_member_interval; if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; unsigned long now = jiffies, lmi; if (!netif_running(brmctx->br->dev) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) { lmi = now + brmctx->multicast_last_member_interval; pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 0, NULL); if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } if (pg->filter_mode == MCAST_EXCLUDE && (!timer_pending(&pg->timer) || time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx)))) mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx)); } /* State Msg type New state Actions * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI */ static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; u32 src_idx; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; return changed; } /* State Msg type New state Actions * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 * Delete (A-B) * Group Timer=GMI */ static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; struct br_ip src_ip; u32 src_idx; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) ent->flags &= ~BR_SGRP_F_DELETE; else ent = br_multicast_new_group_src(pg, &src_ip); if (ent) br_multicast_fwd_src_handle(ent); } br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); __grp_src_delete_marked(pg); } /* State Msg type New state Actions * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI * Delete (X-A) * Delete (Y-A) * Group Timer=GMI */ static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; u32 src_idx; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); changed = true; } } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; return changed; } static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } /* State Msg type New state Actions * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI * Send Q(G,A-B) */ static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_SEND; to_send--; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } /* State Msg type New state Actions * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI * Send Q(G,X-A) * Send Q(G) */ static bool __grp_src_toin_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) if (timer_pending(&ent->timer)) ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { if (timer_pending(&ent->timer)) { ent->flags &= ~BR_SGRP_F_SEND; to_send--; } } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); __grp_send_query_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_toin(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: changed = __grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } if (br_multicast_eht_should_del_pg(pg)) { pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_find_del_pg(pg->key.port->br, pg); /* a notification has already been sent and we shouldn't * access pg after the delete so we have to return false */ changed = false; } return changed; } /* State Msg type New state Actions * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 * Delete (A-B) * Send Q(G,A*B) * Group Timer=GMI */ static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) | BR_SGRP_F_SEND; to_send++; } else { ent = br_multicast_new_group_src(pg, &src_ip); } if (ent) br_multicast_fwd_src_handle(ent); } br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); __grp_src_delete_marked(pg); if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); } /* State Msg type New state Actions * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer * Delete (X-A) * Delete (Y-A) * Send Q(G,A-Y) * Group Timer=GMI */ static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } if (ent && timer_pending(&ent->timer)) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_toex(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } /* State Msg type New state Actions * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B) */ static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } /* State Msg type New state Actions * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer * Send Q(G,A-Y) */ static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } if (ent && timer_pending(&ent->timer)) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_block(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } if ((pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) || br_multicast_eht_should_del_pg(pg)) { if (br_multicast_eht_should_del_pg(pg)) pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_find_del_pg(pg->key.port->br, pg); /* a notification has already been sent and we shouldn't * access pg after the delete so we have to return false */ changed = false; } return changed; } static struct net_bridge_port_group * br_multicast_find_port(struct net_bridge_mdb_entry *mp, struct net_bridge_port *p, const unsigned char *src) { struct net_bridge *br __maybe_unused = mp->br; struct net_bridge_port_group *pg; for (pg = mlock_dereference(mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) if (br_port_group_equal(pg, p, src)) return pg; return NULL; } static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { bool igmpv2 = brmctx->multicast_igmp_version == 2; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; const unsigned char *src; struct igmpv3_report *ih; struct igmpv3_grec *grec; int i, len, num, type; __be32 group, *h_addr; bool changed = false; int err = 0; u16 nsrcs; ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); if (!ip_mc_may_pull(skb, len)) return -EINVAL; grec = (void *)(skb->data + len - sizeof(*grec)); group = grec->grec_mca; type = grec->grec_type; nsrcs = ntohs(grec->grec_nsrcs); len += nsrcs * 4; if (!ip_mc_may_pull(skb, len)) return -EINVAL; switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: case IGMPV3_CHANGE_TO_INCLUDE: case IGMPV3_CHANGE_TO_EXCLUDE: case IGMPV3_ALLOW_NEW_SOURCES: case IGMPV3_BLOCK_OLD_SOURCES: break; default: continue; } src = eth_hdr(skb)->h_source; if (nsrcs == 0 && (type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE)) { if (!pmctx || igmpv2) { br_ip4_multicast_leave_group(brmctx, pmctx, group, vid, src); continue; } } else { err = br_ip4_multicast_add_group(brmctx, pmctx, group, vid, src, igmpv2); if (err) break; } if (!pmctx || igmpv2) continue; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; mdst = br_mdb_ip4_get(brmctx->br, group, vid); if (!mdst) goto unlock_continue; pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; /* reload grec and host addr */ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4)); h_addr = &ip_hdr(skb)->saddr; switch (type) { case IGMPV3_ALLOW_NEW_SOURCES: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_INCLUDE: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_EXCLUDE: changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_INCLUDE: changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_EXCLUDE: changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_BLOCK_OLD_SOURCES: changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; } if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: spin_unlock(&brmctx->br->multicast_lock); } return err; } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { bool mldv1 = brmctx->multicast_mld_version == 1; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; unsigned int nsrcs_offset; struct mld2_report *mld2r; const unsigned char *src; struct in6_addr *h_addr; struct mld2_grec *grec; unsigned int grec_len; bool changed = false; int i, len, num; int err = 0; if (!ipv6_mc_may_pull(skb, sizeof(*mld2r))) return -EINVAL; mld2r = (struct mld2_report *)icmp6_hdr(skb); num = ntohs(mld2r->mld2r_ngrec); len = skb_transport_offset(skb) + sizeof(*mld2r); for (i = 0; i < num; i++) { __be16 *_nsrcs, __nsrcs; u16 nsrcs; nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs); if (skb_transport_offset(skb) + ipv6_transport_len(skb) < nsrcs_offset + sizeof(__nsrcs)) return -EINVAL; _nsrcs = skb_header_pointer(skb, nsrcs_offset, sizeof(__nsrcs), &__nsrcs); if (!_nsrcs) return -EINVAL; nsrcs = ntohs(*_nsrcs); grec_len = struct_size(grec, grec_src, nsrcs); if (!ipv6_mc_may_pull(skb, len + grec_len)) return -EINVAL; grec = (struct mld2_grec *)(skb->data + len); len += grec_len; switch (grec->grec_type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: case MLD2_CHANGE_TO_INCLUDE: case MLD2_CHANGE_TO_EXCLUDE: case MLD2_ALLOW_NEW_SOURCES: case MLD2_BLOCK_OLD_SOURCES: break; default: continue; } src = eth_hdr(skb)->h_source; if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && nsrcs == 0) { if (!pmctx || mldv1) { br_ip6_multicast_leave_group(brmctx, pmctx, &grec->grec_mca, vid, src); continue; } } else { err = br_ip6_multicast_add_group(brmctx, pmctx, &grec->grec_mca, vid, src, mldv1); if (err) break; } if (!pmctx || mldv1) continue; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid); if (!mdst) goto unlock_continue; pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; h_addr = &ipv6_hdr(skb)->saddr; switch (grec->grec_type) { case MLD2_ALLOW_NEW_SOURCES: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_INCLUDE: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_EXCLUDE: changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_INCLUDE: changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_EXCLUDE: changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_BLOCK_OLD_SOURCES: changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; } if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: spin_unlock(&brmctx->br->multicast_lock); } return err; } #endif static bool br_multicast_select_querier(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *saddr) { int port_ifidx = pmctx ? pmctx->port->dev->ifindex : 0; struct timer_list *own_timer, *other_timer; struct bridge_mcast_querier *querier; switch (saddr->proto) { case htons(ETH_P_IP): querier = &brmctx->ip4_querier; own_timer = &brmctx->ip4_own_query.timer; other_timer = &brmctx->ip4_other_query.timer; if (!querier->addr.src.ip4 || ntohl(saddr->src.ip4) <= ntohl(querier->addr.src.ip4)) goto update; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): querier = &brmctx->ip6_querier; own_timer = &brmctx->ip6_own_query.timer; other_timer = &brmctx->ip6_other_query.timer; if (ipv6_addr_cmp(&saddr->src.ip6, &querier->addr.src.ip6) <= 0) goto update; break; #endif default: return false; } if (!timer_pending(own_timer) && !timer_pending(other_timer)) goto update; return false; update: br_multicast_update_querier(brmctx, querier, port_ifidx, saddr); return true; } static struct net_bridge_port * __br_multicast_get_querier_port(struct net_bridge *br, const struct bridge_mcast_querier *querier) { int port_ifidx = READ_ONCE(querier->port_ifidx); struct net_bridge_port *p; struct net_device *dev; if (port_ifidx == 0) return NULL; dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx); if (!dev) return NULL; p = br_port_get_rtnl_rcu(dev); if (!p || p->br != br) return NULL; return p; } size_t br_multicast_querier_state_size(void) { return nla_total_size(0) + /* nest attribute */ nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IP_OTHER_TIMER */ #if IS_ENABLED(CONFIG_IPV6) nla_total_size(sizeof(struct in6_addr)) + /* BRIDGE_QUERIER_IPV6_ADDRESS */ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IPV6_PORT */ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IPV6_OTHER_TIMER */ #endif 0; } /* protected by rtnl or rcu */ int br_multicast_dump_querier_state(struct sk_buff *skb, const struct net_bridge_mcast *brmctx, int nest_attr) { struct bridge_mcast_querier querier = {}; struct net_bridge_port *p; struct nlattr *nest; if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || br_multicast_ctx_vlan_global_disabled(brmctx)) return 0; nest = nla_nest_start(skb, nest_attr); if (!nest) return -EMSGSIZE; rcu_read_lock(); if (!brmctx->multicast_querier && !timer_pending(&brmctx->ip4_other_query.timer)) goto out_v6; br_multicast_read_querier(&brmctx->ip4_querier, &querier); if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS, querier.addr.src.ip4)) { rcu_read_unlock(); goto out_err; } p = __br_multicast_get_querier_port(brmctx->br, &querier); if (timer_pending(&brmctx->ip4_other_query.timer) && (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER, br_timer_value(&brmctx->ip4_other_query.timer), BRIDGE_QUERIER_PAD) || (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) { rcu_read_unlock(); goto out_err; } out_v6: #if IS_ENABLED(CONFIG_IPV6) if (!brmctx->multicast_querier && !timer_pending(&brmctx->ip6_other_query.timer)) goto out; br_multicast_read_querier(&brmctx->ip6_querier, &querier); if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS, &querier.addr.src.ip6)) { rcu_read_unlock(); goto out_err; } p = __br_multicast_get_querier_port(brmctx->br, &querier); if (timer_pending(&brmctx->ip6_other_query.timer) && (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER, br_timer_value(&brmctx->ip6_other_query.timer), BRIDGE_QUERIER_PAD) || (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT, p->dev->ifindex)))) { rcu_read_unlock(); goto out_err; } out: #endif rcu_read_unlock(); nla_nest_end(skb, nest); if (!nla_len(nest)) nla_nest_cancel(skb, nest); return 0; out_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void br_multicast_update_query_timer(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *query, unsigned long max_delay) { if (!timer_pending(&query->timer)) mod_timer(&query->delay_timer, jiffies + max_delay); mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval); } static void br_port_mc_router_state_change(struct net_bridge_port *p, bool is_mc_router) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_PORT_MROUTER, .flags = SWITCHDEV_F_DEFER, .u.mrouter = is_mc_router, }; switchdev_port_attr_set(p->dev, &attr, NULL); } static struct net_bridge_port * br_multicast_rport_from_node(struct net_bridge_mcast *brmctx, struct hlist_head *mc_router_list, struct hlist_node *rlist) { struct net_bridge_mcast_port *pmctx; #if IS_ENABLED(CONFIG_IPV6) if (mc_router_list == &brmctx->ip6_mc_router_list) pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, ip6_rlist); else #endif pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, ip4_rlist); return pmctx->port; } static struct hlist_node * br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx, struct net_bridge_port *port, struct hlist_head *mc_router_list) { struct hlist_node *slot = NULL; struct net_bridge_port *p; struct hlist_node *rlist; hlist_for_each(rlist, mc_router_list) { p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist); if ((unsigned long)port >= (unsigned long)p) break; slot = rlist; } return slot; } static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx, struct hlist_node *rnode) { #if IS_ENABLED(CONFIG_IPV6) if (rnode != &pmctx->ip6_rlist) return hlist_unhashed(&pmctx->ip6_rlist); else return hlist_unhashed(&pmctx->ip4_rlist); #else return true; #endif } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct hlist_node *rlist, struct hlist_head *mc_router_list) { struct hlist_node *slot; if (!hlist_unhashed(rlist)) return; slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list); if (slot) hlist_add_behind_rcu(rlist, slot); else hlist_add_head_rcu(rlist, mc_router_list); /* For backwards compatibility for now, only notify if we * switched from no IPv4/IPv6 multicast router to a new * IPv4 or IPv6 multicast router. */ if (br_multicast_no_router_otherpf(pmctx, rlist)) { br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB); br_port_mc_router_state_change(pmctx->port, true); } } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist, &brmctx->ip4_mc_router_list); } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist, &brmctx->ip6_mc_router_list); #endif } static void br_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct timer_list *timer, struct hlist_node *rlist, struct hlist_head *mc_router_list) { unsigned long now = jiffies; if (!br_multicast_ctx_should_use(brmctx, pmctx)) return; if (!pmctx) { if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { if (!br_ip4_multicast_is_router(brmctx) && !br_ip6_multicast_is_router(brmctx)) br_mc_router_state_change(brmctx->br, true); mod_timer(timer, now + brmctx->multicast_querier_interval); } return; } if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || pmctx->multicast_router == MDB_RTR_TYPE_PERM) return; br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list); mod_timer(timer, now + brmctx->multicast_querier_interval); } static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { struct timer_list *timer = &brmctx->ip4_mc_router_timer; struct hlist_node *rlist = NULL; if (pmctx) { timer = &pmctx->ip4_mc_router_timer; rlist = &pmctx->ip4_rlist; } br_multicast_mark_router(brmctx, pmctx, timer, rlist, &brmctx->ip4_mc_router_list); } static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) struct timer_list *timer = &brmctx->ip6_mc_router_timer; struct hlist_node *rlist = NULL; if (pmctx) { timer = &pmctx->ip6_mc_router_timer; rlist = &pmctx->ip6_rlist; } br_multicast_mark_router(brmctx, pmctx, timer, rlist, &brmctx->ip6_mc_router_list); #endif } static void br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; br_multicast_update_query_timer(brmctx, query, max_delay); br_ip4_multicast_mark_router(brmctx, pmctx); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; br_multicast_update_query_timer(brmctx, query, max_delay); br_ip6_multicast_mark_router(brmctx, pmctx); } #endif static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { unsigned int transport_len = ip_transport_len(skb); const struct iphdr *iph = ip_hdr(skb); struct igmphdr *ih = igmp_hdr(skb); struct net_bridge_mdb_entry *mp; struct igmpv3_query *ih3; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; __be32 group; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; group = ih->group; if (transport_len == sizeof(*ih)) { max_delay = ih->code * (HZ / IGMP_TIMER_SCALE); if (!max_delay) { max_delay = 10 * HZ; group = 0; } } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs || (brmctx->multicast_igmp_version == 3 && group && ih3->suppress)) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; } else { goto out; } if (!group) { saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; br_ip4_multicast_query_received(brmctx, pmctx, &brmctx->ip4_other_query, &saddr, max_delay); goto out; } mp = br_mdb_ip4_get(brmctx->br, group, vid); if (!mp) goto out; max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : timer_delete_sync_try(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : timer_delete_sync_try(&p->timer) >= 0 && (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: spin_unlock(&brmctx->br->multicast_lock); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { unsigned int transport_len = ipv6_transport_len(skb); struct mld_msg *mld; struct net_bridge_mdb_entry *mp; struct mld2_query *mld2q; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; unsigned int offset = skb_transport_offset(skb); const struct in6_addr *group = NULL; bool is_general_query; int err = 0; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; if (transport_len == sizeof(*mld)) { if (!pskb_may_pull(skb, offset + sizeof(*mld))) { err = -EINVAL; goto out; } mld = (struct mld_msg *) icmp6_hdr(skb); max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay)); if (max_delay) group = &mld->mld_mca; } else { if (!pskb_may_pull(skb, offset + sizeof(*mld2q))) { err = -EINVAL; goto out; } mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; if (brmctx->multicast_mld_version == 2 && !ipv6_addr_any(&mld2q->mld2q_mca) && mld2q->mld2q_suppress) goto out; max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL); } is_general_query = group && ipv6_addr_any(group); if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; br_ip6_multicast_query_received(brmctx, pmctx, &brmctx->ip6_other_query, &saddr, max_delay); goto out; } else if (!group) { goto out; } mp = br_mdb_ip6_get(brmctx->br, group, vid); if (!mp) goto out; max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : timer_delete_sync_try(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : timer_delete_sync_try(&p->timer) >= 0 && (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: spin_unlock(&brmctx->br->multicast_lock); return err; } #endif static void br_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, struct bridge_mcast_other_query *other_query, struct bridge_mcast_own_query *own_query, const unsigned char *src) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; unsigned long now; unsigned long time; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; mp = br_mdb_ip_get(brmctx->br, group); if (!mp) goto out; if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) { struct net_bridge_port_group __rcu **pp; for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (!br_port_group_equal(p, pmctx->port, src)) continue; if (p->flags & MDB_PG_FLAGS_PERMANENT) break; p->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_del_pg(mp, p, pp); } goto out; } if (timer_pending(&other_query->timer)) goto out; if (brmctx->multicast_querier) { __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr, false, 0, NULL); time = jiffies + brmctx->multicast_last_member_count * brmctx->multicast_last_member_interval; mod_timer(&own_query->timer, time); for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL && pmctx != NULL; p = mlock_dereference(p->next, brmctx->br)) { if (!br_port_group_equal(p, pmctx->port, src)) continue; if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : timer_delete_sync_try(&p->timer) >= 0)) { mod_timer(&p->timer, time); } break; } } now = jiffies; time = now + brmctx->multicast_last_member_count * brmctx->multicast_last_member_interval; if (!pmctx) { if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : timer_delete_sync_try(&mp->timer) >= 0)) { mod_timer(&mp->timer, time); } goto out; } for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL; p = mlock_dereference(p->next, brmctx->br)) { if (p->key.port != pmctx->port) continue; if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : timer_delete_sync_try(&p->timer) >= 0)) { mod_timer(&p->timer, time); } break; } out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; if (ipv4_is_local_multicast(group)) return; own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; br_multicast_leave_group(brmctx, pmctx, &br_group, &brmctx->ip4_other_query, own_query, src); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; if (ipv6_addr_is_ll_all_nodes(group)) return; own_query = pmctx ? &pmctx->ip6_own_query : &brmctx->ip6_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; br_multicast_leave_group(brmctx, pmctx, &br_group, &brmctx->ip6_other_query, own_query, src); } #endif static void br_multicast_err_count(const struct net_bridge *br, const struct net_bridge_port *p, __be16 proto) { struct bridge_mcast_stats __percpu *stats; struct bridge_mcast_stats *pstats; if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; pstats = this_cpu_ptr(stats); u64_stats_update_begin(&pstats->syncp); switch (proto) { case htons(ETH_P_IP): pstats->mstats.igmp_parse_errors++; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): pstats->mstats.mld_parse_errors++; break; #endif } u64_stats_update_end(&pstats->syncp); } static void br_multicast_pim(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct sk_buff *skb) { unsigned int offset = skb_transport_offset(skb); struct pimhdr *pimhdr, _pimhdr; pimhdr = skb_header_pointer(skb, offset, sizeof(_pimhdr), &_pimhdr); if (!pimhdr || pim_hdr_version(pimhdr) != PIM_VERSION || pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; spin_lock(&brmctx->br->multicast_lock); br_ip4_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); } static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (ip_hdr(skb)->protocol != IPPROTO_IGMP || igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; spin_lock(&brmctx->br->multicast_lock); br_ip4_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); return 0; } static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct igmphdr *ih; int err; err = ip_mc_check_igmp(skb); if (err == -ENOMSG) { if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) { BR_INPUT_SKB_CB(skb)->mrouters_only = 1; } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) br_multicast_pim(brmctx, pmctx, skb); } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb); } return 0; } else if (err < 0) { br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } ih = igmp_hdr(skb); src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->igmp = ih->type; switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid, src, true); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: br_ip4_multicast_query(brmctx, pmctx, skb, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src); break; } br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; spin_lock(&brmctx->br->multicast_lock); br_ip6_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); } static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct mld_msg *mld; int err; err = ipv6_mc_check_mld(skb); if (err == -ENOMSG || err == -ENODATA) { if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; if (err == -ENODATA && ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb); return 0; } else if (err < 0) { br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } mld = (struct mld_msg *)skb_transport_header(skb); BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type; switch (mld->mld_type) { case ICMPV6_MGM_REPORT: src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca, vid, src, true); break; case ICMPV6_MLD2_REPORT: err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_QUERY: err = br_ip6_multicast_query(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_REDUCTION: src = eth_hdr(skb)->h_source; br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid, src); break; } br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #endif int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { int ret = 0; BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED)) return 0; if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) { const struct net_bridge_vlan *masterv; /* the vlan has the master flag set only when transmitting * through the bridge device */ if (br_vlan_is_master(vlan)) { masterv = vlan; *brmctx = &vlan->br_mcast_ctx; *pmctx = NULL; } else { masterv = vlan->brvlan; *brmctx = &vlan->brvlan->br_mcast_ctx; *pmctx = &vlan->port_mcast_ctx; } if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) return 0; } switch (skb->protocol) { case htons(ETH_P_IP): ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid); break; #endif } return ret; } static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { spin_lock(&brmctx->br->multicast_lock); if (br_multicast_ctx_vlan_disabled(brmctx)) goto out; if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; br_multicast_send_query(brmctx, NULL, query); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_query_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip4_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip6_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip6_own_query); } #endif static void br_multicast_gc_work(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, mcast_gc_work); HLIST_HEAD(deleted_head); spin_lock_bh(&br->multicast_lock); hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); } void br_multicast_ctx_init(struct net_bridge *br, struct net_bridge_vlan *vlan, struct net_bridge_mcast *brmctx) { brmctx->br = br; brmctx->vlan = vlan; brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; brmctx->multicast_last_member_count = 2; brmctx->multicast_startup_query_count = 2; brmctx->multicast_last_member_interval = HZ; brmctx->multicast_query_response_interval = 10 * HZ; brmctx->multicast_startup_query_interval = 125 * HZ / 4; brmctx->multicast_query_interval = 125 * HZ; brmctx->multicast_querier_interval = 255 * HZ; brmctx->multicast_membership_interval = 260 * HZ; brmctx->ip4_querier.port_ifidx = 0; seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock); brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) brmctx->multicast_mld_version = 1; brmctx->ip6_querier.port_ifidx = 0; seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock); #endif timer_setup(&brmctx->ip4_mc_router_timer, br_ip4_multicast_local_router_expired, 0); timer_setup(&brmctx->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); timer_setup(&brmctx->ip4_other_query.delay_timer, br_multicast_query_delay_expired, 0); timer_setup(&brmctx->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) timer_setup(&brmctx->ip6_mc_router_timer, br_ip6_multicast_local_router_expired, 0); timer_setup(&brmctx->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); timer_setup(&brmctx->ip6_other_query.delay_timer, br_multicast_query_delay_expired, 0); timer_setup(&brmctx->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif } void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) { __br_multicast_stop(brmctx); } void br_multicast_init(struct net_bridge *br) { br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; br_multicast_ctx_init(br, NULL, &br->multicast_ctx); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); spin_lock_init(&br->multicast_lock); INIT_HLIST_HEAD(&br->mdb_list); INIT_HLIST_HEAD(&br->mcast_gc_list); INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work); } static void br_ip4_multicast_join_snoopers(struct net_bridge *br) { struct in_device *in_dev = in_dev_get(br->dev); if (!in_dev) return; __ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC); in_dev_put(in_dev); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_join_snoopers(struct net_bridge *br) { struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); ipv6_dev_mc_inc(br->dev, &addr); } #else static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br) { } #endif void br_multicast_join_snoopers(struct net_bridge *br) { br_ip4_multicast_join_snoopers(br); br_ip6_multicast_join_snoopers(br); } static void br_ip4_multicast_leave_snoopers(struct net_bridge *br) { struct in_device *in_dev = in_dev_get(br->dev); if (WARN_ON(!in_dev)) return; __ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC); in_dev_put(in_dev); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_snoopers(struct net_bridge *br) { struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); ipv6_dev_mc_dec(br->dev, &addr); } #else static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br) { } #endif void br_multicast_leave_snoopers(struct net_bridge *br) { br_ip4_multicast_leave_snoopers(br); br_ip6_multicast_leave_snoopers(br); } static void __br_multicast_open_query(struct net_bridge *br, struct bridge_mcast_own_query *query) { query->startup_sent = 0; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return; mod_timer(&query->timer, jiffies); } static void __br_multicast_open(struct net_bridge_mcast *brmctx) { __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query); #endif } void br_multicast_open(struct net_bridge *br) { ASSERT_RTNL(); if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; vg = br_vlan_group(br); if (vg) { list_for_each_entry(vlan, &vg->vlan_list, vlist) { struct net_bridge_mcast *brmctx; brmctx = &vlan->br_mcast_ctx; if (br_vlan_is_brentry(vlan) && !br_multicast_ctx_vlan_disabled(brmctx)) __br_multicast_open(&vlan->br_mcast_ctx); } } } else { __br_multicast_open(&br->multicast_ctx); } } static void __br_multicast_stop(struct net_bridge_mcast *brmctx) { timer_delete_sync(&brmctx->ip4_mc_router_timer); timer_delete_sync(&brmctx->ip4_other_query.timer); timer_delete_sync(&brmctx->ip4_other_query.delay_timer); timer_delete_sync(&brmctx->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) timer_delete_sync(&brmctx->ip6_mc_router_timer); timer_delete_sync(&brmctx->ip6_other_query.timer); timer_delete_sync(&brmctx->ip6_other_query.delay_timer); timer_delete_sync(&brmctx->ip6_own_query.timer); #endif } void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state) { #if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) struct net_bridge *br; if (!br_vlan_should_use(v)) return; if (br_vlan_is_master(v)) return; br = v->port->br; if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) return; if (br_vlan_state_allowed(state, true)) br_multicast_enable_port_ctx(&v->port_mcast_ctx); /* Multicast is not disabled for the vlan when it goes in * blocking state because the timers will expire and stop by * themselves without sending more queries. */ #endif } void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge *br; /* it's okay to check for the flag without the multicast lock because it * can only change under RTNL -> multicast_lock, we need the latter to * sync with timers and packets */ if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) return; if (br_vlan_is_master(vlan)) { br = vlan->br; if (!br_vlan_is_brentry(vlan) || (on && br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx))) return; spin_lock_bh(&br->multicast_lock); vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; spin_unlock_bh(&br->multicast_lock); if (on) __br_multicast_open(&vlan->br_mcast_ctx); else __br_multicast_stop(&vlan->br_mcast_ctx); } else { struct net_bridge_mcast *brmctx; brmctx = br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx); if (on && br_multicast_ctx_vlan_global_disabled(brmctx)) return; br = vlan->port->br; spin_lock_bh(&br->multicast_lock); vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; if (on) __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx); else __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx); spin_unlock_bh(&br->multicast_lock); } } static void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge_port *p; if (WARN_ON_ONCE(!br_vlan_is_master(vlan))) return; list_for_each_entry(p, &vlan->br->port_list, list) { struct net_bridge_vlan *vport; vport = br_vlan_find(nbp_vlan_group(p), vlan->vid); if (!vport) continue; br_multicast_toggle_one_vlan(vport, on); } if (br_vlan_is_brentry(vlan)) br_multicast_toggle_one_vlan(vlan, on); } int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; struct net_bridge_port *p; if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on) return 0; if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled"); return -EINVAL; } vg = br_vlan_group(br); if (!vg) return 0; br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on); /* disable/enable non-vlan mcast contexts based on vlan snooping */ if (on) __br_multicast_stop(&br->multicast_ctx); else __br_multicast_open(&br->multicast_ctx); list_for_each_entry(p, &br->port_list, list) { if (on) br_multicast_disable_port_ctx(&p->multicast_ctx); else br_multicast_enable_port_ctx(&p->multicast_ctx); } list_for_each_entry(vlan, &vg->vlan_list, vlist) br_multicast_toggle_vlan(vlan, on); return 0; } bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) { ASSERT_RTNL(); /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and * requires only RTNL to change */ if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) return false; vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED; br_multicast_toggle_vlan(vlan, on); return true; } void br_multicast_stop(struct net_bridge *br) { ASSERT_RTNL(); if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; vg = br_vlan_group(br); if (vg) { list_for_each_entry(vlan, &vg->vlan_list, vlist) { struct net_bridge_mcast *brmctx; brmctx = &vlan->br_mcast_ctx; if (br_vlan_is_brentry(vlan) && !br_multicast_ctx_vlan_disabled(brmctx)) __br_multicast_stop(&vlan->br_mcast_ctx); } } } else { __br_multicast_stop(&br->multicast_ctx); } } void br_multicast_dev_del(struct net_bridge *br) { struct net_bridge_mdb_entry *mp; HLIST_HEAD(deleted_head); struct hlist_node *tmp; spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) br_multicast_del_mdb_entry(mp); hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_ctx_deinit(&br->multicast_ctx); br_multicast_gc(&deleted_head); cancel_work_sync(&br->mcast_gc_work); rcu_barrier(); } int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val) { int err = -EINVAL; spin_lock_bh(&brmctx->br->multicast_lock); switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM); timer_delete(&brmctx->ip4_mc_router_timer); #if IS_ENABLED(CONFIG_IPV6) timer_delete(&brmctx->ip6_mc_router_timer); #endif brmctx->multicast_router = val; err = 0; break; case MDB_RTR_TYPE_TEMP_QUERY: if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) br_mc_router_state_change(brmctx->br, false); brmctx->multicast_router = val; err = 0; break; } spin_unlock_bh(&brmctx->br->multicast_lock); return err; } static void br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted) { if (!deleted) return; /* For backwards compatibility for now, only notify if there is * no multicast router anymore for both IPv4 and IPv6. */ if (!hlist_unhashed(&pmctx->ip4_rlist)) return; #if IS_ENABLED(CONFIG_IPV6) if (!hlist_unhashed(&pmctx->ip6_rlist)) return; #endif br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB); br_port_mc_router_state_change(pmctx->port, false); /* don't allow timer refresh */ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, unsigned long val) { struct net_bridge_mcast *brmctx; unsigned long now = jiffies; int err = -EINVAL; bool del = false; brmctx = br_multicast_port_ctx_get_global(pmctx); spin_lock_bh(&brmctx->br->multicast_lock); if (pmctx->multicast_router == val) { /* Refresh the temp router port timer */ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) { mod_timer(&pmctx->ip4_mc_router_timer, now + brmctx->multicast_querier_interval); #if IS_ENABLED(CONFIG_IPV6) mod_timer(&pmctx->ip6_mc_router_timer, now + brmctx->multicast_querier_interval); #endif } err = 0; goto unlock; } switch (val) { case MDB_RTR_TYPE_DISABLED: pmctx->multicast_router = MDB_RTR_TYPE_DISABLED; del |= br_ip4_multicast_rport_del(pmctx); timer_delete(&pmctx->ip4_mc_router_timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) timer_delete(&pmctx->ip6_mc_router_timer); #endif br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_TEMP_QUERY: pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; del |= br_ip4_multicast_rport_del(pmctx); del |= br_ip6_multicast_rport_del(pmctx); br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_PERM: pmctx->multicast_router = MDB_RTR_TYPE_PERM; timer_delete(&pmctx->ip4_mc_router_timer); br_ip4_multicast_add_router(brmctx, pmctx); #if IS_ENABLED(CONFIG_IPV6) timer_delete(&pmctx->ip6_mc_router_timer); #endif br_ip6_multicast_add_router(brmctx, pmctx); break; case MDB_RTR_TYPE_TEMP: pmctx->multicast_router = MDB_RTR_TYPE_TEMP; br_ip4_multicast_mark_router(brmctx, pmctx); br_ip6_multicast_mark_router(brmctx, pmctx); break; default: goto unlock; } err = 0; unlock: spin_unlock_bh(&brmctx->br->multicast_lock); return err; } int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router) { int err; if (br_vlan_is_master(v)) err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router); else err = br_multicast_set_port_router(&v->port_mcast_ctx, mcast_router); return err; } static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { struct net_bridge_port *port; if (!br_multicast_ctx_matches_vlan_snooping(brmctx)) return; __br_multicast_open_query(brmctx->br, query); rcu_read_lock(); list_for_each_entry_rcu(port, &brmctx->br->port_list, list) { struct bridge_mcast_own_query *ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query *ip6_own_query; #endif if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx)) continue; if (br_multicast_ctx_is_vlan(brmctx)) { struct net_bridge_vlan *vlan; vlan = br_vlan_find(nbp_vlan_group_rcu(port), brmctx->vlan->vid); if (!vlan || br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx)) continue; ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query; #endif } else { ip4_own_query = &port->multicast_ctx.ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) ip6_own_query = &port->multicast_ctx.ip6_own_query; #endif } if (query == &brmctx->ip4_own_query) br_multicast_enable(ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) else br_multicast_enable(ip6_own_query); #endif } rcu_read_unlock(); } static void br_multicast_del_grps(struct net_bridge *br) { struct net_bridge_port *port; list_for_each_entry(port, &br->port_list, list) __br_multicast_disable_port_ctx(&port->multicast_ctx); } int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_port *port; bool change_snoopers = false; int err = 0; spin_lock_bh(&br->multicast_lock); if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) goto unlock; err = br_mc_disabled_update(br->dev, val, extack); if (err == -EOPNOTSUPP) err = 0; if (err) goto unlock; br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; br_multicast_del_grps(br); goto unlock; } if (!netif_running(br->dev)) goto unlock; br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port_ctx(&port->multicast_ctx); change_snoopers = true; unlock: spin_unlock_bh(&br->multicast_lock); /* br_multicast_join_snoopers has the potential to cause * an MLD Report/Leave to be delivered to br_multicast_rcv, * which would in turn call br_multicast_add_group, which would * attempt to acquire multicast_lock. This function should be * called after the lock has been released to avoid deadlocks on * multicast_lock. * * br_multicast_leave_snoopers does not have the problem since * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and * returns without calling br_multicast_ipv4/6_rcv if it's not * enabled. Moved both functions out just for symmetry. */ if (change_snoopers) { if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) br_multicast_join_snoopers(br); else br_multicast_leave_snoopers(br); } return err; } bool br_multicast_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); return !!br_opt_get(br, BROPT_MULTICAST_ENABLED); } EXPORT_SYMBOL_GPL(br_multicast_enabled); bool br_multicast_router(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); bool is_router; spin_lock_bh(&br->multicast_lock); is_router = br_multicast_is_router(&br->multicast_ctx, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } EXPORT_SYMBOL_GPL(br_multicast_router); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long max_delay; val = !!val; spin_lock_bh(&brmctx->br->multicast_lock); if (brmctx->multicast_querier == val) goto unlock; WRITE_ONCE(brmctx->multicast_querier, val); if (!val) goto unlock; max_delay = brmctx->multicast_query_response_interval; if (!timer_pending(&brmctx->ip4_other_query.timer)) mod_timer(&brmctx->ip4_other_query.delay_timer, jiffies + max_delay); br_multicast_start_querier(brmctx, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) if (!timer_pending(&brmctx->ip6_other_query.timer)) mod_timer(&brmctx->ip6_other_query.delay_timer, jiffies + max_delay); br_multicast_start_querier(brmctx, &brmctx->ip6_own_query); #endif unlock: spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val) { /* Currently we support only version 2 and 3 */ switch (val) { case 2: case 3: break; default: return -EINVAL; } spin_lock_bh(&brmctx->br->multicast_lock); brmctx->multicast_igmp_version = val; spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #if IS_ENABLED(CONFIG_IPV6) int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, unsigned long val) { /* Currently we support version 1 and 2 */ switch (val) { case 1: case 2: break; default: return -EINVAL; } spin_lock_bh(&brmctx->br->multicast_lock); brmctx->multicast_mld_version = val; spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #endif void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long intvl_jiffies = clock_t_to_jiffies(val); if (intvl_jiffies < BR_MULTICAST_QUERY_INTVL_MIN) { br_info(brmctx->br, "trying to set multicast query interval below minimum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MIN), jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MIN)); intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN; } if (intvl_jiffies > BR_MULTICAST_QUERY_INTVL_MAX) { br_info(brmctx->br, "trying to set multicast query interval above maximum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX), jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX)); intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MAX; } brmctx->multicast_query_interval = intvl_jiffies; } void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long intvl_jiffies = clock_t_to_jiffies(val); if (intvl_jiffies < BR_MULTICAST_STARTUP_QUERY_INTVL_MIN) { br_info(brmctx->br, "trying to set multicast startup query interval below minimum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN), jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN)); intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN; } if (intvl_jiffies > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) { br_info(brmctx->br, "trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX), jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX)); intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX; } brmctx->multicast_startup_query_interval = intvl_jiffies; } /** * br_multicast_list_adjacent - Returns snooped multicast addresses * @dev: The bridge port adjacent to which to retrieve addresses * @br_ip_list: The list to store found, snooped multicast IP addresses in * * Creates a list of IP addresses (struct br_ip_list) sensed by the multicast * snooping feature on all bridge ports of dev's bridge device, excluding * the addresses from dev itself. * * Returns the number of items added to br_ip_list. * * Notes: * - br_ip_list needs to be initialized by caller * - br_ip_list might contain duplicates in the end * (needs to be taken care of by caller) * - br_ip_list needs to be freed by caller */ int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) { struct net_bridge *br; struct net_bridge_port *port; struct net_bridge_port_group *group; struct br_ip_list *entry; int count = 0; rcu_read_lock(); if (!br_ip_list || !netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; list_for_each_entry_rcu(port, &br->port_list, list) { if (!port->dev || port->dev == dev) continue; hlist_for_each_entry_rcu(group, &port->mglist, mglist) { entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) goto unlock; entry->addr = group->key.addr; list_add(&entry->list, br_ip_list); count++; } } unlock: rcu_read_unlock(); return count; } EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); /** * br_multicast_has_querier_anywhere - Checks for a querier on a bridge * @dev: The bridge port providing the bridge on which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a valid querier exists anywhere on the bridged link layer. * Otherwise returns false. */ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) { struct net_bridge *br; struct net_bridge_port *port; struct ethhdr eth; bool ret = false; rcu_read_lock(); if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; memset(&eth, 0, sizeof(eth)); eth.h_proto = htons(proto); ret = br_multicast_querier_exists(&br->multicast_ctx, &eth, NULL); unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); /** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a selected querier is behind one of the other ports of this * bridge. Otherwise returns false. */ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) { struct net_bridge_mcast *brmctx; struct net_bridge *br; struct net_bridge_port *port; bool ret = false; int port_ifidx; rcu_read_lock(); if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; brmctx = &br->multicast_ctx; switch (proto) { case ETH_P_IP: port_ifidx = brmctx->ip4_querier.port_ifidx; if (!timer_pending(&brmctx->ip4_other_query.timer) || port_ifidx == port->dev->ifindex) goto unlock; break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: port_ifidx = brmctx->ip6_querier.port_ifidx; if (!timer_pending(&brmctx->ip6_other_query.timer) || port_ifidx == port->dev->ifindex) goto unlock; break; #endif default: goto unlock; } ret = true; unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); /** * br_multicast_has_router_adjacent - Checks for a router behind a bridge port * @dev: The bridge port adjacent to which to check for a multicast router * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a multicast router is behind one of the other ports of this * bridge. Otherwise returns false. */ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) { struct net_bridge_mcast_port *pmctx; struct net_bridge_mcast *brmctx; struct net_bridge_port *port; bool ret = false; rcu_read_lock(); port = br_port_get_check_rcu(dev); if (!port) goto unlock; brmctx = &port->br->multicast_ctx; switch (proto) { case ETH_P_IP: hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) { if (pmctx->port == port) continue; ret = true; goto unlock; } break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) { if (pmctx->port == port) continue; ret = true; goto unlock; } break; #endif default: /* when compiled without IPv6 support, be conservative and * always assume presence of an IPv6 multicast router */ ret = true; } unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent); static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats *pstats = this_cpu_ptr(stats); __be16 proto = skb->protocol; unsigned int t_len; u64_stats_update_begin(&pstats->syncp); switch (proto) { case htons(ETH_P_IP): t_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); switch (type) { case IGMP_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v1reports[dir]++; break; case IGMPV2_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v2reports[dir]++; break; case IGMPV3_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v3reports[dir]++; break; case IGMP_HOST_MEMBERSHIP_QUERY: if (t_len != sizeof(struct igmphdr)) { pstats->mstats.igmp_v3queries[dir]++; } else { unsigned int offset = skb_transport_offset(skb); struct igmphdr *ih, _ihdr; ih = skb_header_pointer(skb, offset, sizeof(_ihdr), &_ihdr); if (!ih) break; if (!ih->code) pstats->mstats.igmp_v1queries[dir]++; else pstats->mstats.igmp_v2queries[dir]++; } break; case IGMP_HOST_LEAVE_MESSAGE: pstats->mstats.igmp_leaves[dir]++; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): t_len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); t_len -= skb_network_header_len(skb); switch (type) { case ICMPV6_MGM_REPORT: pstats->mstats.mld_v1reports[dir]++; break; case ICMPV6_MLD2_REPORT: pstats->mstats.mld_v2reports[dir]++; break; case ICMPV6_MGM_QUERY: if (t_len != sizeof(struct mld_msg)) pstats->mstats.mld_v2queries[dir]++; else pstats->mstats.mld_v1queries[dir]++; break; case ICMPV6_MGM_REDUCTION: pstats->mstats.mld_leaves[dir]++; break; } break; #endif /* CONFIG_IPV6 */ } u64_stats_update_end(&pstats->syncp); } void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats __percpu *stats; /* if multicast_disabled is true then igmp type can't be set */ if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; br_mcast_stats_add(stats, skb, type, dir); } int br_multicast_init_stats(struct net_bridge *br) { br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!br->mcast_stats) return -ENOMEM; return 0; } void br_multicast_uninit_stats(struct net_bridge *br) { free_percpu(br->mcast_stats); } /* noinline for https://llvm.org/pr45802#c9 */ static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src) { dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX]; } void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest) { struct bridge_mcast_stats __percpu *stats; struct br_mcast_stats tdst; int i; memset(dest, 0, sizeof(*dest)); if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; memset(&tdst, 0, sizeof(tdst)); for_each_possible_cpu(i) { struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i); struct br_mcast_stats temp; unsigned int start; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); mcast_stats_add_dir(tdst.igmp_v3queries, temp.igmp_v3queries); mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves); mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports); mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports); mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports); tdst.igmp_parse_errors += temp.igmp_parse_errors; mcast_stats_add_dir(tdst.mld_v1queries, temp.mld_v1queries); mcast_stats_add_dir(tdst.mld_v2queries, temp.mld_v2queries); mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves); mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports); mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports); tdst.mld_parse_errors += temp.mld_parse_errors; } memcpy(dest, &tdst, sizeof(*dest)); } int br_mdb_hash_init(struct net_bridge *br) { int err; err = rhashtable_init(&br->sg_port_tbl, &br_sg_port_rht_params); if (err) return err; err = rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); if (err) { rhashtable_destroy(&br->sg_port_tbl); return err; } return 0; } void br_mdb_hash_fini(struct net_bridge *br) { rhashtable_destroy(&br->sg_port_tbl); rhashtable_destroy(&br->mdb_hash_tbl); }
39 34 1 1 4 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-only /* * Support Intel/AMD RAPL energy consumption counters * Copyright (C) 2013 Google, Inc., Stephane Eranian * * Intel RAPL interface is specified in the IA-32 Manual Vol3b * section 14.7.1 (September 2013) * * AMD RAPL interface for Fam17h is described in the public PPR: * https://bugzilla.kernel.org/show_bug.cgi?id=206537 * * RAPL provides more controls than just reporting energy consumption * however here we only expose the 3 energy consumption free running * counters (pp0, pkg, dram). * * Each of those counters increments in a power unit defined by the * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules * but it can vary. * * Counter to rapl events mappings: * * pp0 counter: consumption of all physical cores (power plane 0) * event: rapl_energy_cores * perf code: 0x1 * * pkg counter: consumption of the whole processor package * event: rapl_energy_pkg * perf code: 0x2 * * dram counter: consumption of the dram domain (servers only) * event: rapl_energy_dram * perf code: 0x3 * * gpu counter: consumption of the builtin-gpu domain (client only) * event: rapl_energy_gpu * perf code: 0x4 * * psys counter: consumption of the builtin-psys domain (client only) * event: rapl_energy_psys * perf code: 0x5 * * core counter: consumption of a single physical core * event: rapl_energy_core (power_core PMU) * perf code: 0x1 * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * * The events only support system-wide mode counting. There is no * sampling support because it does not make sense and is not * supported by the RAPL hardware. * * Because we want to avoid floating-point operations in the kernel, * the events are all reported in fixed point arithmetic (32.32). * Tools must adjust the counts to convert them to Watts using * the duration of the measurement. Tools may use a function such as * ldexp(raw_count, -32); */ #define pr_fmt(fmt) "RAPL PMU: " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/perf_event.h> #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include <asm/msr.h> #include "perf_event.h" #include "probe.h" MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters"); MODULE_LICENSE("GPL"); /* * RAPL energy status counters */ enum perf_rapl_pkg_events { PERF_RAPL_PP0 = 0, /* all cores */ PERF_RAPL_PKG, /* entire package */ PERF_RAPL_RAM, /* DRAM */ PERF_RAPL_PP1, /* gpu */ PERF_RAPL_PSYS, /* psys */ PERF_RAPL_PKG_EVENTS_MAX, NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, }; #define PERF_RAPL_CORE 0 /* single core */ #define PERF_RAPL_CORE_EVENTS_MAX 1 #define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { "pp0-core", "package", "dram", "pp1-gpu", "psys", }; static const char *const rapl_core_domain_name __initconst = "core"; /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved */ #define RAPL_EVENT_MASK 0xFFULL #define RAPL_CNTR_WIDTH 32 #define RAPL_EVENT_ATTR_STR(_name, v, str) \ static struct perf_pmu_events_attr event_attr_##v = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = str, \ }; /* * RAPL Package energy counter scope: * 1. AMD/HYGON platforms have a per-PKG package energy counter * 2. For Intel platforms * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope * 2.2. Other Intel platforms are single die systems so the scope can be * considered as either pkg-scope or die-scope, and we are considering * them as die-scope. */ #define rapl_pkg_pmu_is_pkg_scope() \ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) struct rapl_pmu { raw_spinlock_t lock; int n_active; int cpu; struct list_head active_list; struct pmu *pmu; ktime_t timer_interval; struct hrtimer hrtimer; }; struct rapl_pmus { struct pmu pmu; unsigned int nr_rapl_pmu; unsigned int cntr_mask; struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); }; enum rapl_unit_quirk { RAPL_UNIT_QUIRK_NONE, RAPL_UNIT_QUIRK_INTEL_HSW, RAPL_UNIT_QUIRK_INTEL_SPR, }; struct rapl_model { struct perf_msr *rapl_pkg_msrs; struct perf_msr *rapl_core_msrs; unsigned long pkg_events; unsigned long core_events; unsigned int msr_power_unit; enum rapl_unit_quirk unit_quirk; }; /* 1/2^hw_unit Joule */ static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; static int rapl_core_hw_unit __read_mostly; static struct rapl_pmus *rapl_pmus_pkg; static struct rapl_pmus *rapl_pmus_core; static u64 rapl_timer_ms; static struct rapl_model *rapl_model; /* * Helper function to get the correct topology id according to the * RAPL PMU scope. */ static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) { /* * Returns unsigned int, which converts the '-1' return value * (for non-existent mappings in topology map) to UINT_MAX, so * the error check in the caller is simplified. */ switch (scope) { case PERF_PMU_SCOPE_PKG: return topology_logical_package_id(cpu); case PERF_PMU_SCOPE_DIE: return topology_logical_die_id(cpu); case PERF_PMU_SCOPE_CORE: return topology_logical_core_id(cpu); default: return -EINVAL; } } static inline u64 rapl_read_counter(struct perf_event *event) { u64 raw; rdmsrq(event->hw.event_base, raw); return raw; } static inline u64 rapl_scale(u64 v, struct perf_event *event) { int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; if (event->pmu->scope == PERF_PMU_SCOPE_CORE) hw_unit = rapl_core_hw_unit; /* * scale delta to smallest unit (1/2^32) * users must then scale back: count * 1/(1e9*2^32) to get Joules * or use ldexp(count, -32). * Watts = Joules/Time delta */ return v << (32 - hw_unit); } static u64 rapl_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 prev_raw_count, new_raw_count; s64 delta, sdelta; int shift = RAPL_CNTR_WIDTH; prev_raw_count = local64_read(&hwc->prev_count); do { rdmsrq(event->hw.event_base, new_raw_count); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. */ delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; sdelta = rapl_scale(delta, event); local64_add(sdelta, &event->count); return new_raw_count; } static void rapl_start_hrtimer(struct rapl_pmu *pmu) { hrtimer_start(&pmu->hrtimer, pmu->timer_interval, HRTIMER_MODE_REL_PINNED); } static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); struct perf_event *event; unsigned long flags; if (!rapl_pmu->n_active) return HRTIMER_NORESTART; raw_spin_lock_irqsave(&rapl_pmu->lock, flags); list_for_each_entry(event, &rapl_pmu->active_list, active_entry) rapl_event_update(event); raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); return HRTIMER_RESTART; } static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) { struct hrtimer *hr = &rapl_pmu->hrtimer; hrtimer_setup(hr, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, struct perf_event *event) { if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; event->hw.state = 0; list_add_tail(&event->active_entry, &rapl_pmu->active_list); local64_set(&event->hw.prev_count, rapl_read_counter(event)); rapl_pmu->n_active++; if (rapl_pmu->n_active == 1) rapl_start_hrtimer(rapl_pmu); } static void rapl_pmu_event_start(struct perf_event *event, int mode) { struct rapl_pmu *rapl_pmu = event->pmu_private; unsigned long flags; raw_spin_lock_irqsave(&rapl_pmu->lock, flags); __rapl_pmu_event_start(rapl_pmu, event); raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); } static void rapl_pmu_event_stop(struct perf_event *event, int mode) { struct rapl_pmu *rapl_pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; raw_spin_lock_irqsave(&rapl_pmu->lock, flags); /* mark event as deactivated and stopped */ if (!(hwc->state & PERF_HES_STOPPED)) { WARN_ON_ONCE(rapl_pmu->n_active <= 0); rapl_pmu->n_active--; if (rapl_pmu->n_active == 0) hrtimer_cancel(&rapl_pmu->hrtimer); list_del(&event->active_entry); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } /* check if update of sw counter is necessary */ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /* * Drain the remaining delta count out of a event * that we are disabling: */ rapl_event_update(event); hwc->state |= PERF_HES_UPTODATE; } raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); } static int rapl_pmu_event_add(struct perf_event *event, int mode) { struct rapl_pmu *rapl_pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; raw_spin_lock_irqsave(&rapl_pmu->lock, flags); hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (mode & PERF_EF_START) __rapl_pmu_event_start(rapl_pmu, event); raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); return 0; } static void rapl_pmu_event_del(struct perf_event *event, int flags) { rapl_pmu_event_stop(event, PERF_EF_UPDATE); } static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, rapl_pmus_scope, ret = 0; struct rapl_pmu *rapl_pmu; unsigned int rapl_pmu_idx; struct rapl_pmus *rapl_pmus; /* only look at RAPL events */ if (event->attr.type != event->pmu->type) return -ENOENT; /* unsupported modes and filters */ if (event->attr.sample_period) /* no sampling */ return -EINVAL; /* check only supported bits are set */ if (event->attr.config & ~RAPL_EVENT_MASK) return -EINVAL; if (event->cpu < 0) return -EINVAL; rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); if (!rapl_pmus) return -EINVAL; rapl_pmus_scope = rapl_pmus->pmu.scope; if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; bit = cfg - 1; event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; bit = cfg - 1; event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; } else return -EINVAL; /* check event supported */ if (!(rapl_pmus->cntr_mask & (1 << bit))) return -EINVAL; rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) return -EINVAL; /* must be done before validate_group */ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; if (!rapl_pmu) return -EINVAL; event->pmu_private = rapl_pmu; event->hw.config = cfg; event->hw.idx = bit; return ret; } static void rapl_pmu_event_read(struct perf_event *event) { rapl_event_update(event); } RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); /* * we compute in 0.23 nJ increments regardless of MSR */ RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); /* * There are no default events, but we need to create * "events" group (with empty attrs) before updating * it with detected events. */ static struct attribute *attrs_empty[] = { NULL, }; static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = attrs_empty, }; PMU_FORMAT_ATTR(event, "config:0-7"); static struct attribute *rapl_formats_attr[] = { &format_attr_event.attr, NULL, }; static struct attribute_group rapl_pmu_format_group = { .name = "format", .attrs = rapl_formats_attr, }; static const struct attribute_group *rapl_attr_groups[] = { &rapl_pmu_format_group, &rapl_pmu_events_group, NULL, }; static const struct attribute_group *rapl_core_attr_groups[] = { &rapl_pmu_format_group, &rapl_pmu_events_group, NULL, }; static struct attribute *rapl_events_cores[] = { EVENT_PTR(rapl_cores), EVENT_PTR(rapl_cores_unit), EVENT_PTR(rapl_cores_scale), NULL, }; static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, }; static struct attribute *rapl_events_pkg[] = { EVENT_PTR(rapl_pkg), EVENT_PTR(rapl_pkg_unit), EVENT_PTR(rapl_pkg_scale), NULL, }; static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, }; static struct attribute *rapl_events_ram[] = { EVENT_PTR(rapl_ram), EVENT_PTR(rapl_ram_unit), EVENT_PTR(rapl_ram_scale), NULL, }; static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, }; static struct attribute *rapl_events_gpu[] = { EVENT_PTR(rapl_gpu), EVENT_PTR(rapl_gpu_unit), EVENT_PTR(rapl_gpu_scale), NULL, }; static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, }; static struct attribute *rapl_events_psys[] = { EVENT_PTR(rapl_psys), EVENT_PTR(rapl_psys_unit), EVENT_PTR(rapl_psys_scale), NULL, }; static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, }; static struct attribute *rapl_events_core[] = { EVENT_PTR(rapl_core), EVENT_PTR(rapl_core_unit), EVENT_PTR(rapl_core_scale), NULL, }; static struct attribute_group rapl_events_core_group = { .name = "events", .attrs = rapl_events_core, }; static bool test_msr(int idx, void *data) { return test_bit(idx, (unsigned long *) data); } /* Only lower 32bits of the MSR represents the energy counter */ #define RAPL_MSR_MASK 0xFFFFFFFF static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK }, }; static struct perf_msr intel_rapl_spr_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK }, }; /* * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) * - want to use same event codes across both architectures */ static struct perf_msr amd_rapl_pkg_msrs[] = { [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, NULL, false, 0 }, [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, }; static struct perf_msr amd_rapl_core_msrs[] = { [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, test_msr, false, RAPL_MSR_MASK }, }; static int rapl_check_hw_unit(void) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrq() to handle virtualization */ if (rdmsrq_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; switch (rapl_model->unit_quirk) { /* * DRAM domain on HSW server and KNL has fixed energy unit which can be * different than the unit from power unit MSR. See * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ case RAPL_UNIT_QUIRK_INTEL_HSW: rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; break; /* SPR uses a fixed energy unit for Psys domain. */ case RAPL_UNIT_QUIRK_INTEL_SPR: rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; break; default: break; } /* * Calculate the timer rate: * Use reference of 200W for scaling the timeout to avoid counter * overflows. 200W = 200 Joules/sec * Divide interval by 2 to avoid lockstep (2 * 100) * if hw unit is 32, then we use 2 ms 1/200/2 */ rapl_timer_ms = 2; if (rapl_pkg_hw_unit[0] < 32) { rapl_timer_ms = (1000 / (2 * 100)); rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); } return 0; } static void __init rapl_advertise(void) { int i; int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); if (rapl_pmus_core) num_counters += hweight32(rapl_pmus_core->cntr_mask); pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", num_counters, rapl_timer_ms); for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { if (rapl_pmus_pkg->cntr_mask & (1 << i)) { pr_info("hw unit of domain %s 2^-%d Joules\n", rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); } } if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) pr_info("hw unit of domain %s 2^-%d Joules\n", rapl_core_domain_name, rapl_core_hw_unit); } static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) { int i; for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) kfree(rapl_pmus->rapl_pmu[i]); kfree(rapl_pmus); } static const struct attribute_group *rapl_attr_update[] = { &rapl_events_cores_group, &rapl_events_pkg_group, &rapl_events_ram_group, &rapl_events_gpu_group, &rapl_events_psys_group, NULL, }; static const struct attribute_group *rapl_core_attr_update[] = { &rapl_events_core_group, NULL, }; static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) { struct rapl_pmu *rapl_pmu; int idx; for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); if (!rapl_pmu) goto free; raw_spin_lock_init(&rapl_pmu->lock); INIT_LIST_HEAD(&rapl_pmu->active_list); rapl_pmu->pmu = &rapl_pmus->pmu; rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); rapl_hrtimer_init(rapl_pmu); rapl_pmus->rapl_pmu[idx] = rapl_pmu; } return 0; free: for (; idx > 0; idx--) kfree(rapl_pmus->rapl_pmu[idx - 1]); return -ENOMEM; } static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, const struct attribute_group **rapl_attr_groups, const struct attribute_group **rapl_attr_update) { int nr_rapl_pmu = topology_max_packages(); struct rapl_pmus *rapl_pmus; int ret; /* * rapl_pmu_scope must be either PKG, DIE or CORE */ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) nr_rapl_pmu *= topology_max_dies_per_package(); else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) nr_rapl_pmu *= topology_num_cores_per_package(); else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) return -EINVAL; rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); if (!rapl_pmus) return -ENOMEM; *rapl_pmus_ptr = rapl_pmus; rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; rapl_pmus->pmu.attr_groups = rapl_attr_groups; rapl_pmus->pmu.attr_update = rapl_attr_update; rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; rapl_pmus->pmu.event_init = rapl_pmu_event_init; rapl_pmus->pmu.add = rapl_pmu_event_add; rapl_pmus->pmu.del = rapl_pmu_event_del; rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; rapl_pmus->pmu.scope = rapl_pmu_scope; rapl_pmus->pmu.module = THIS_MODULE; rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ret = init_rapl_pmu(rapl_pmus); if (ret) kfree(rapl_pmus); return ret; } static struct rapl_model model_snb = { .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .pkg_events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_spr = { .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PSYS), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, .msr_power_unit = MSR_RAPL_POWER_UNIT, .rapl_pkg_msrs = intel_rapl_spr_msrs, }; static struct rapl_model model_amd_hygon = { .pkg_events = BIT(PERF_RAPL_PKG), .core_events = BIT(PERF_RAPL_CORE), .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, .rapl_pkg_msrs = amd_rapl_pkg_msrs, .rapl_core_msrs = amd_rapl_core_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon), X86_MATCH_VFM(INTEL_SANDYBRIDGE, &model_snb), X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &model_snbep), X86_MATCH_VFM(INTEL_IVYBRIDGE, &model_snb), X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &model_snbep), X86_MATCH_VFM(INTEL_HASWELL, &model_hsw), X86_MATCH_VFM(INTEL_HASWELL_X, &model_hsx), X86_MATCH_VFM(INTEL_HASWELL_L, &model_hsw), X86_MATCH_VFM(INTEL_HASWELL_G, &model_hsw), X86_MATCH_VFM(INTEL_BROADWELL, &model_hsw), X86_MATCH_VFM(INTEL_BROADWELL_G, &model_hsw), X86_MATCH_VFM(INTEL_BROADWELL_X, &model_hsx), X86_MATCH_VFM(INTEL_BROADWELL_D, &model_hsx), X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &model_knl), X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &model_knl), X86_MATCH_VFM(INTEL_SKYLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_SKYLAKE, &model_skl), X86_MATCH_VFM(INTEL_SKYLAKE_X, &model_hsx), X86_MATCH_VFM(INTEL_KABYLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_KABYLAKE, &model_skl), X86_MATCH_VFM(INTEL_CANNONLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &model_hsw), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &model_hsw), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &model_hsw), X86_MATCH_VFM(INTEL_ICELAKE_L, &model_skl), X86_MATCH_VFM(INTEL_ICELAKE, &model_skl), X86_MATCH_VFM(INTEL_ICELAKE_D, &model_hsx), X86_MATCH_VFM(INTEL_ICELAKE_X, &model_hsx), X86_MATCH_VFM(INTEL_COMETLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_COMETLAKE, &model_skl), X86_MATCH_VFM(INTEL_TIGERLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_TIGERLAKE, &model_skl), X86_MATCH_VFM(INTEL_ALDERLAKE, &model_skl), X86_MATCH_VFM(INTEL_ALDERLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &model_skl), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &model_spr), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &model_spr), X86_MATCH_VFM(INTEL_RAPTORLAKE, &model_skl), X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_skl), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &model_skl), X86_MATCH_VFM(INTEL_METEORLAKE, &model_skl), X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &model_skl), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) { const struct x86_cpu_id *id; int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; int ret; if (rapl_pkg_pmu_is_pkg_scope()) rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; id = x86_match_cpu(rapl_model_match); if (!id) return -ENODEV; rapl_model = (struct rapl_model *) id->driver_data; ret = rapl_check_hw_unit(); if (ret) return ret; ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, rapl_attr_update); if (ret) return ret; rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, false, (void *) &rapl_model->pkg_events); ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); if (ret) goto out; if (rapl_model->core_events) { ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, rapl_core_attr_groups, rapl_core_attr_update); if (ret) { pr_warn("power-core PMU initialization failed (%d)\n", ret); goto core_init_failed; } rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, PERF_RAPL_CORE_EVENTS_MAX, false, (void *) &rapl_model->core_events); ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); if (ret) { pr_warn("power-core PMU registration failed (%d)\n", ret); cleanup_rapl_pmus(rapl_pmus_core); } } core_init_failed: rapl_advertise(); return 0; out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(rapl_pmus_pkg); return ret; } module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { if (rapl_pmus_core) { perf_pmu_unregister(&rapl_pmus_core->pmu); cleanup_rapl_pmus(rapl_pmus_core); } perf_pmu_unregister(&rapl_pmus_pkg->pmu); cleanup_rapl_pmus(rapl_pmus_pkg); } module_exit(intel_rapl_exit);
1050 1048 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-only /* * rcuref - A scalable reference count implementation for RCU managed objects * * rcuref is provided to replace open coded reference count implementations * based on atomic_t. It protects explicitely RCU managed objects which can * be visible even after the last reference has been dropped and the object * is heading towards destruction. * * A common usage pattern is: * * get() * rcu_read_lock(); * p = get_ptr(); * if (p && !atomic_inc_not_zero(&p->refcnt)) * p = NULL; * rcu_read_unlock(); * return p; * * put() * if (!atomic_dec_return(&->refcnt)) { * remove_ptr(p); * kfree_rcu((p, rcu); * } * * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has * O(N^2) behaviour under contention with N concurrent operations. * * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales * better under contention. * * Why not refcount? * ================= * * In principle it should be possible to make refcount use the rcuref * scheme, but the destruction race described below cannot be prevented * unless the protected object is RCU managed. * * Theory of operation * =================== * * rcuref uses an unsigned integer reference counter. As long as the * counter value is greater than or equal to RCUREF_ONEREF and not larger * than RCUREF_MAXREF the reference is alive: * * ONEREF MAXREF SATURATED RELEASED DEAD NOREF * 0 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF * <---valid --------> <-------saturation zone-------> <-----dead zone-----> * * The get() and put() operations do unconditional increments and * decrements. The result is checked after the operation. This optimizes * for the fast path. * * If the reference count is saturated or dead, then the increments and * decrements are not harmful as the reference count still stays in the * respective zones and is always set back to STATURATED resp. DEAD. The * zones have room for 2^28 racing operations in each direction, which * makes it practically impossible to escape the zones. * * Once the last reference is dropped the reference count becomes * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The * slowpath then tries to set the reference count from RCUREF_NOREF to * RCUREF_DEAD via a cmpxchg(). This opens a small window where a * concurrent rcuref_get() can acquire the reference count and bring it * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. * * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in * DEAD + 1, which is inside the dead zone. If that happens the reference * count is put back to DEAD. * * The actual race is possible due to the unconditional increment and * decrements in rcuref_get() and rcuref_put(): * * T1 T2 * get() put() * if (atomic_add_negative(-1, &ref->refcnt)) * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); * * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 * * As the result of T1's add is negative, the get() goes into the slow path * and observes refcnt being in the dead zone which makes the operation fail. * * Possible critical states: * * Context Counter References Operation * T1 0 1 init() * T2 1 2 get() * T1 0 1 put() * T2 -1 0 put() tries to mark dead * T1 0 1 get() * T2 0 1 put() mark dead fails * T1 -1 0 put() tries to mark dead * T1 DEAD 0 put() mark dead succeeds * T2 DEAD+1 0 get() fails and puts it back to DEAD * * Of course there are more complex scenarios, but the above illustrates * the working principle. The rest is left to the imagination of the * reader. * * Deconstruction race * =================== * * The release operation must be protected by prohibiting a grace period in * order to prevent a possible use after free: * * T1 T2 * put() get() * // ref->refcnt = ONEREF * if (!atomic_add_negative(-1, &ref->refcnt)) * return false; <- Not taken * * // ref->refcnt == NOREF * --> preemption * // Elevates ref->refcnt to ONEREF * if (!atomic_add_negative(1, &ref->refcnt)) * return true; <- taken * * if (put(&p->ref)) { <-- Succeeds * remove_pointer(p); * kfree_rcu(p, rcu); * } * * RCU grace period ends, object is freed * * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF * * This is prevented by disabling preemption around the put() operation as * that's in most kernel configurations cheaper than a rcu_read_lock() / * rcu_read_unlock() pair and in many cases even a NOOP. In any case it * prevents the grace period which keeps the object alive until all put() * operations complete. * * Saturation protection * ===================== * * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). * Once this is exceedded the reference count becomes stale by setting it * to RCUREF_SATURATED, which will cause a memory leak, but it prevents * wrap arounds which obviously cause worse problems than a memory * leak. When saturation is reached a warning is emitted. * * Race conditions * =============== * * All reference count increment/decrement operations are unconditional and * only verified after the fact. This optimizes for the good case and takes * the occasional race vs. a dead or already saturated refcount into * account. The saturation and dead zones are large enough to accomodate * for that. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object to increase the * reference count on will provide the ordering. For locked data * structures, its the lock acquire, for RCU/lockless data structures its * the dependent load. * * rcuref_get() provides a control dependency ordering future stores which * ensures that the object is not modified when acquiring a reference * fails. * * rcuref_put() provides release order, i.e. all prior loads and stores * will be issued before. It also provides a control dependency ordering * against the subsequent destruction of the object. * * If rcuref_put() successfully dropped the last reference and marked the * object DEAD it also provides acquire ordering. */ #include <linux/export.h> #include <linux/rcuref.h> /** * rcuref_get_slowpath - Slowpath of rcuref_get() * @ref: Pointer to the reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * False if the reference count was already marked dead * * True if the reference count is saturated, which prevents the * object from being deconstructed ever. */ bool rcuref_get_slowpath(rcuref_t *ref) { unsigned int cnt = atomic_read(&ref->refcnt); /* * If the reference count was already marked dead, undo the * increment so it stays in the middle of the dead zone and return * fail. */ if (cnt >= RCUREF_RELEASED) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * If it was saturated, warn and mark it so. In case the increment * was already on a saturated value restore the saturation * marker. This keeps it in the middle of the saturation zone and * prevents the reference count from overflowing. This leaks the * object memory, but prevents the obvious reference count overflow * damage. */ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory")) atomic_set(&ref->refcnt, RCUREF_SATURATED); return true; } EXPORT_SYMBOL_GPL(rcuref_get_slowpath); /** * rcuref_put_slowpath - Slowpath of __rcuref_put() * @ref: Pointer to the reference count * @cnt: The resulting value of the fastpath decrement * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt) { /* Did this drop the last reference? */ if (likely(cnt == RCUREF_NOREF)) { /* * Carefully try to set the reference count to RCUREF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > RCUREF_MAXREF) atomic_set(&ref->refcnt, RCUREF_SATURATED); return false; } EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
249 37 27416 73 6956 14306 53 2260 11 11 11 84 3 1183 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/cleanup.h> #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/nospec.h> #include <linux/sched.h> #include <linux/ucopysize.h> #include <asm/uaccess.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * * Passing down mm_struct allows to define untagging rules on per-process * basis. * * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef untagged_addr_remote #define untagged_addr_remote(mm, addr) ({ \ mmap_assert_locked(mm); \ untagged_addr(addr); \ }) #endif #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 # ifndef masked_user_write_access_begin # define masked_user_write_access_begin masked_user_access_begin # endif # ifndef masked_user_read_access_begin # define masked_user_read_access_begin masked_user_access_begin #endif #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL #define masked_user_read_access_begin(src) NULL #define masked_user_write_access_begin(src) NULL #define mask_user_address(src) (src) #endif /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { unsigned long res; instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res; might_fault(); instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } /** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } /* * Architectures that #define INLINE_COPY_TO_USER use this function * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. */ static inline __must_check unsigned long _inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (should_fail_usercopy()) goto fail; if (can_do_masked_user_access()) from = mask_user_address(from); else { if (!access_ok(from, n)) goto fail; /* * Ensure that bad access_ok() speculation will not * lead to nasty side effects *after* the copy is * finished: */ barrier_nospec(); } instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); if (likely(!res)) return 0; fail: memset(to + (n - res), 0, res); return res; } #ifndef INLINE_COPY_FROM_USER extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif static inline __must_check unsigned long _inline_copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } #ifndef INLINE_COPY_TO_USER extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (!check_copy_size(to, n, false)) return n; #ifdef INLINE_COPY_FROM_USER return _inline_copy_from_user(to, from, n); #else return _copy_from_user(to, from, n); #endif } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { if (!check_copy_size(from, n, true)) return n; #ifdef INLINE_COPY_TO_USER return _inline_copy_to_user(to, from, n); #else return _copy_to_user(to, from, n); #endif } #ifndef copy_mc_to_kernel /* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read. */ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) DEFINE_LOCK_GUARD_0(pagefault, pagefault_disable(), pagefault_enable()) #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** * probe_subpage_writeable: probe the user range for write faults at sub-page * granularity (e.g. arm64 MTE) * @uaddr: start of address range * @size: size of address range * * Returns 0 on success, the number of bytes not probed on fault. * * It is expected that the caller checked for the write permission of each * page in the range either by put_user() or GUP. The architecture port can * implement a more efficient get_user() probing if the same sub-page faults * are triggered by either a read or a write. */ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) { return 0; } #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. */ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } /** * copy_struct_to_user: copy a struct to userspace * @dst: Destination address, in userspace. This buffer must be @ksize * bytes long. * @usize: (Alleged) size of @dst struct. * @src: Source address, in kernel space. * @ksize: Size of @src struct. * @ignored_trailing: Set to %true if there was a non-zero byte in @src that * userspace cannot see because they are using an smaller struct. * * Copies a struct from kernel space to userspace, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * Some syscalls may wish to make sure that userspace knows about everything in * the struct, and if there is a non-zero value that userspce doesn't know * about, they want to return an error (such as -EMSGSIZE) or have some other * fallback (such as adding a "you're missing some information" flag). If * @ignored_trailing is non-%NULL, it will be set to %true if there was a * non-zero byte that could not be copied to userspace (ie. was past @usize). * * While unconditionally returning an error in this case is the simplest * solution, for maximum backward compatibility you should try to only return * -EMSGSIZE if the user explicitly requested the data that couldn't be copied. * Note that structure sizes can change due to header changes and simple * recompilations without code changes(!), so if you care about * @ignored_trailing you probably want to make sure that any new field data is * associated with a flag. Otherwise you might assume that a program knows * about data it does not. * * @ksize is just sizeof(*src), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize) * { * int err; * bool ignored_trailing; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * // ... modify karg somehow ... * * err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg), * &ignored_trailing); * if (err) * return err; * if (ignored_trailing) * return -EMSGSIZE: * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the kernel is trying to pass userspace a newer * struct than it supports. Thus we only copy the interoperable portions * (@usize) and ignore the rest (but @ignored_trailing is set to %true if * any of the trailing (@ksize - @usize) bytes are non-zero). * * If @usize > @ksize, then the kernel is trying to pass userspace an older * struct than userspace supports. In order to make sure the * unknown-to-the-kernel fields don't contain garbage values, we zero the * trailing (@usize - @ksize) bytes. * * Returns (in all cases, some data may have been copied): * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_to_user(void __user *dst, size_t usize, const void *src, size_t ksize, bool *ignored_trailing) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize > ksize) { if (clear_user(dst + size, rest)) return -EFAULT; } if (ignored_trailing) *ignored_trailing = ksize < usize && memchr_inv(src + size, 0, rest) != NULL; /* Copy the interoperable parts of the struct. */ if (copy_to_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); #ifdef arch_get_kernel_nofault /* * Wrap the architecture implementation so that @label can be outside of a * cleanup() scope. A regular C goto works correctly, but ASM goto does * not. Clang rejects such an attempt, but GCC silently emits buggy code. */ #define __get_kernel_nofault(dst, src, type, label) \ do { \ __label__ local_label; \ arch_get_kernel_nofault(dst, src, type, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ __label__ local_label; \ arch_put_kernel_nofault(dst, src, type, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */ #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ type data; \ if (__get_user(data, p)) \ goto label; \ *(type *)dst = data; \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(dst); \ type data = *(type *)src; \ if (__put_user(data, p)) \ goto label; \ } while (0) #endif /* !__get_kernel_nofault */ /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. */ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifdef user_access_begin #ifdef arch_unsafe_get_user /* * Wrap the architecture implementation so that @label can be outside of a * cleanup() scope. A regular C goto works correctly, but ASM goto does * not. Clang rejects such an attempt, but GCC silently emits buggy code. * * Some architectures use internal local labels already, but this extra * indirection here is harmless because the compiler optimizes it out * completely in any case. This construct just ensures that the ASM GOTO * target is always in the local scope. The C goto 'label' works correctly * when leaving a cleanup() scope. */ #define unsafe_get_user(x, ptr, label) \ do { \ __label__ local_label; \ arch_unsafe_get_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #define unsafe_put_user(x, ptr, label) \ do { \ __label__ local_label; \ arch_unsafe_put_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #endif /* arch_unsafe_get_user */ #else /* user_access_begin */ #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif /* !user_access_begin */ #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) #define user_rw_access_end() user_access_end() /* Scoped user access */ #define USER_ACCESS_GUARD(_mode) \ static __always_inline void __user * \ class_user_##_mode##_begin(void __user *ptr) \ { \ return ptr; \ } \ \ static __always_inline void \ class_user_##_mode##_end(void __user *ptr) \ { \ user_##_mode##_access_end(); \ } \ \ DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ class_user_##_mode##_end(_T), \ class_user_##_mode##_begin(ptr), void __user *ptr) \ \ static __always_inline class_user_##_mode##_access_t \ class_user_##_mode##_access_ptr(void __user *scope) \ { \ return scope; \ } USER_ACCESS_GUARD(read) USER_ACCESS_GUARD(write) USER_ACCESS_GUARD(rw) #undef USER_ACCESS_GUARD /** * __scoped_user_access_begin - Start a scoped user access * @mode: The mode of the access class (read, write, rw) * @uptr: The pointer to access user space memory * @size: Size of the access * @elbl: Error label to goto when the access region is rejected * * Internal helper for __scoped_user_access(). Don't use directly. */ #define __scoped_user_access_begin(mode, uptr, size, elbl) \ ({ \ typeof(uptr) __retptr; \ \ if (can_do_masked_user_access()) { \ __retptr = masked_user_##mode##_access_begin(uptr); \ } else { \ __retptr = uptr; \ if (!user_##mode##_access_begin(uptr, size)) \ goto elbl; \ } \ __retptr; \ }) /** * __scoped_user_access - Open a scope for user access * @mode: The mode of the access class (read, write, rw) * @uptr: The pointer to access user space memory * @size: Size of the access * @elbl: Error label to goto when the access region is rejected. It * must be placed outside the scope * * If the user access function inside the scope requires a fault label, it * can use @elbl or a different label outside the scope, which requires * that user access which is implemented with ASM GOTO has been properly * wrapped. See unsafe_get_user() for reference. * * scoped_user_rw_access(ptr, efault) { * unsafe_get_user(rval, &ptr->rval, efault); * unsafe_put_user(wval, &ptr->wval, efault); * } * return 0; * efault: * return -EFAULT; * * The scope is internally implemented as a autoterminating nested for() * loop, which can be left with 'return', 'break' and 'goto' at any * point. * * When the scope is left user_##@_mode##_access_end() is automatically * invoked. * * When the architecture supports masked user access and the access region * which is determined by @uptr and @size is not a valid user space * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user * space address and does not terminate early. This optimizes for the good * case and lets the performance uncritical bad case go through the fault. * * The eventual modification of the pointer is limited to the scope. * Outside of the scope the original pointer value is unmodified, so that * the original pointer value is available for diagnostic purposes in an * out of scope fault path. * * Nesting scoped user access into a user access scope is invalid and fails * the build. Nesting into other guards, e.g. pagefault is safe. * * The masked variant does not check the size of the access and relies on a * mapping hole (e.g. guard page) to catch an out of range pointer, the * first access to user memory inside the scope has to be within * @uptr ... @uptr + PAGE_SIZE - 1 * * Don't use directly. Use scoped_masked_user_$MODE_access() instead. */ #define __scoped_user_access(mode, uptr, size, elbl) \ for (bool done = false; !done; done = true) \ for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ !done; done = true) \ for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ /* Force modified pointer usage within the scope */ \ for (const typeof(uptr) uptr = _tmpptr; !done; done = true) /** * scoped_user_read_access_size - Start a scoped user read access with given size * @usrc: Pointer to the user space address to read from * @size: Size of the access starting from @usrc * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_read_access_size(usrc, size, elbl) \ __scoped_user_access(read, usrc, size, elbl) /** * scoped_user_read_access - Start a scoped user read access * @usrc: Pointer to the user space address to read from * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @usrc is determined via sizeof(*@usrc)). * * For further information see __scoped_user_access() above. */ #define scoped_user_read_access(usrc, elbl) \ scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl) /** * scoped_user_write_access_size - Start a scoped user write access with given size * @udst: Pointer to the user space address to write to * @size: Size of the access starting from @udst * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_write_access_size(udst, size, elbl) \ __scoped_user_access(write, udst, size, elbl) /** * scoped_user_write_access - Start a scoped user write access * @udst: Pointer to the user space address to write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @udst is determined via sizeof(*@udst)). * * For further information see __scoped_user_access() above. */ #define scoped_user_write_access(udst, elbl) \ scoped_user_write_access_size(udst, sizeof(*(udst)), elbl) /** * scoped_user_rw_access_size - Start a scoped user read/write access with given size * @uptr Pointer to the user space address to read from and write to * @size: Size of the access starting from @uptr * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_rw_access_size(uptr, size, elbl) \ __scoped_user_access(rw, uptr, size, elbl) /** * scoped_user_rw_access - Start a scoped user read/write access * @uptr Pointer to the user space address to read from and write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @uptr is determined via sizeof(*@uptr)). * * For further information see __scoped_user_access() above. */ #define scoped_user_rw_access(uptr, elbl) \ scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) /** * get_user_inline - Read user data inlined * @val: The variable to store the value read from user memory * @usrc: Pointer to the user space memory to read from * * Return: 0 if successful, -EFAULT when faulted * * Inlined variant of get_user(). Only use when there is a demonstrable * performance reason. */ #define get_user_inline(val, usrc) \ ({ \ __label__ efault; \ typeof(usrc) _tmpsrc = usrc; \ int _ret = 0; \ \ scoped_user_read_access(_tmpsrc, efault) \ unsafe_get_user(val, _tmpsrc, efault); \ if (0) { \ efault: \ _ret = -EFAULT; \ } \ _ret; \ }) /** * put_user_inline - Write to user memory inlined * @val: The value to write * @udst: Pointer to the user space memory to write to * * Return: 0 if successful, -EFAULT when faulted * * Inlined variant of put_user(). Only use when there is a demonstrable * performance reason. */ #define put_user_inline(val, udst) \ ({ \ __label__ efault; \ typeof(udst) _tmpdst = udst; \ int _ret = 0; \ \ scoped_user_write_access(_tmpdst, efault) \ unsafe_put_user(val, _tmpdst, efault); \ if (0) { \ efault: \ _ret = -EFAULT; \ } \ _ret; \ }) #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */
5 5 5 5 5 5 5 13 13 13 32 12 12 20 13 7 20 20 20 20 1 20 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "queueing.h" #include "socket.h" #include "timers.h" #include "device.h" #include "ratelimiter.h" #include "peer.h" #include "messages.h" #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/icmp.h> #include <linux/suspend.h> #include <net/dst_metadata.h> #include <net/gso.h> #include <net/icmp.h> #include <net/rtnetlink.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> static LIST_HEAD(device_list); static int wg_open(struct net_device *dev) { struct in_device *dev_v4 = __in_dev_get_rtnl(dev); struct inet6_dev *dev_v6 = __in6_dev_get(dev); struct wg_device *wg = netdev_priv(dev); struct wg_peer *peer; int ret; if (dev_v4) { /* At some point we might put this check near the ip_rt_send_ * redirect call of ip_forward in net/ipv4/ip_forward.c, similar * to the current secpath check. */ IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false); IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false; } if (dev_v6) dev_v6->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_NONE; mutex_lock(&wg->device_update_lock); ret = wg_socket_init(wg, wg->incoming_port); if (ret < 0) goto out; list_for_each_entry(peer, &wg->peer_list, peer_list) { wg_packet_send_staged_packets(peer); if (peer->persistent_keepalive_interval) wg_packet_send_keepalive(peer); } out: mutex_unlock(&wg->device_update_lock); return ret; } static int wg_pm_notification(struct notifier_block *nb, unsigned long action, void *data) { struct wg_device *wg; struct wg_peer *peer; /* If the machine is constantly suspending and resuming, as part of * its normal operation rather than as a somewhat rare event, then we * don't actually want to clear keys. */ if (IS_ENABLED(CONFIG_PM_AUTOSLEEP) || IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP)) return 0; if (action != PM_HIBERNATION_PREPARE && action != PM_SUSPEND_PREPARE) return 0; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { mutex_lock(&wg->device_update_lock); list_for_each_entry(peer, &wg->peer_list, peer_list) { timer_delete(&peer->timer_zero_key_material); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); } mutex_unlock(&wg->device_update_lock); } rtnl_unlock(); rcu_barrier(); return 0; } static struct notifier_block pm_notifier = { .notifier_call = wg_pm_notification }; static int wg_vm_notification(struct notifier_block *nb, unsigned long action, void *data) { struct wg_device *wg; struct wg_peer *peer; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { mutex_lock(&wg->device_update_lock); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_noise_expire_current_peer_keypairs(peer); mutex_unlock(&wg->device_update_lock); } rtnl_unlock(); return 0; } static struct notifier_block vm_notifier = { .notifier_call = wg_vm_notification }; static int wg_stop(struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); struct wg_peer *peer; struct sk_buff *skb; mutex_lock(&wg->device_update_lock); list_for_each_entry(peer, &wg->peer_list, peer_list) { wg_packet_purge_staged_packets(peer); wg_timers_stop(peer); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); } mutex_unlock(&wg->device_update_lock); while ((skb = ptr_ring_consume(&wg->handshake_queue.ring)) != NULL) kfree_skb(skb); atomic_set(&wg->handshake_queue_len, 0); wg_socket_reinit(wg, NULL, NULL); return 0; } static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); struct sk_buff_head packets; struct wg_peer *peer; struct sk_buff *next; sa_family_t family; u32 mtu; int ret; if (unlikely(!wg_check_packet_protocol(skb))) { ret = -EPROTONOSUPPORT; net_dbg_ratelimited("%s: Invalid IP packet\n", dev->name); goto err; } peer = wg_allowedips_lookup_dst(&wg->peer_allowedips, skb); if (unlikely(!peer)) { ret = -ENOKEY; if (skb->protocol == htons(ETH_P_IP)) net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI4\n", dev->name, &ip_hdr(skb)->daddr); else if (skb->protocol == htons(ETH_P_IPV6)) net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n", dev->name, &ipv6_hdr(skb)->daddr); goto err_icmp; } family = READ_ONCE(peer->endpoint.addr.sa_family); if (unlikely(family != AF_INET && family != AF_INET6)) { ret = -EDESTADDRREQ; net_dbg_ratelimited("%s: No valid endpoint has been configured or discovered for peer %llu\n", dev->name, peer->internal_id); goto err_peer; } mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; __skb_queue_head_init(&packets); if (!skb_is_gso(skb)) { skb_mark_not_on_list(skb); } else { struct sk_buff *segs = skb_gso_segment(skb, 0); if (IS_ERR(segs)) { ret = PTR_ERR(segs); goto err_peer; } dev_kfree_skb(skb); skb = segs; } skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) continue; /* We only need to keep the original dst around for icmp, * so at this point we're in a position to drop it. */ skb_dst_drop(skb); PACKET_CB(skb)->mtu = mtu; __skb_queue_tail(&packets, skb); } spin_lock_bh(&peer->staged_packet_queue.lock); /* If the queue is getting too big, we start removing the oldest packets * until it's small again. We do this before adding the new packet, so * we don't remove GSO segments that are in excess. */ while (skb_queue_len(&peer->staged_packet_queue) > MAX_STAGED_PACKETS) { dev_kfree_skb(__skb_dequeue(&peer->staged_packet_queue)); DEV_STATS_INC(dev, tx_dropped); } skb_queue_splice_tail(&packets, &peer->staged_packet_queue); spin_unlock_bh(&peer->staged_packet_queue.lock); wg_packet_send_staged_packets(peer); wg_peer_put(peer); return NETDEV_TX_OK; err_peer: wg_peer_put(peer); err_icmp: if (skb->protocol == htons(ETH_P_IP)) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); else if (skb->protocol == htons(ETH_P_IPV6)) icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return ret; } static const struct net_device_ops netdev_ops = { .ndo_open = wg_open, .ndo_stop = wg_stop, .ndo_start_xmit = wg_xmit, }; static void wg_destruct(struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); rtnl_lock(); list_del(&wg->device_list); rtnl_unlock(); mutex_lock(&wg->device_update_lock); rcu_assign_pointer(wg->creating_net, NULL); wg->incoming_port = 0; wg_socket_reinit(wg, NULL, NULL); /* The final references are cleared in the below calls to destroy_workqueue. */ wg_peer_remove_all(wg); destroy_workqueue(wg->handshake_receive_wq); destroy_workqueue(wg->handshake_send_wq); destroy_workqueue(wg->packet_crypt_wq); wg_packet_queue_free(&wg->handshake_queue, true); wg_packet_queue_free(&wg->decrypt_queue, false); wg_packet_queue_free(&wg->encrypt_queue, false); rcu_barrier(); /* Wait for all the peers to be actually freed. */ wg_ratelimiter_uninit(); memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); kvfree(wg->index_hashtable); kvfree(wg->peer_hashtable); mutex_unlock(&wg->device_update_lock); pr_debug("%s: Interface destroyed\n", dev->name); free_netdev(dev); } static const struct device_type device_type = { .name = KBUILD_MODNAME }; static void wg_setup(struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA }; const int overhead = MESSAGE_MINIMUM_LENGTH + sizeof(struct udphdr) + max(sizeof(struct ipv6hdr), sizeof(struct iphdr)); dev->netdev_ops = &netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->hard_header_len = 0; dev->addr_len = 0; dev->needed_headroom = DATA_PACKET_HEAD_ROOM; dev->needed_tailroom = noise_encrypted_len(MESSAGE_PADDING_MULTIPLE); dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->features |= WG_NETDEV_FEATURES; dev->hw_features |= WG_NETDEV_FEATURES; dev->hw_enc_features |= WG_NETDEV_FEATURES; dev->mtu = ETH_DATA_LEN - overhead; dev->max_mtu = round_down(INT_MAX, MESSAGE_PADDING_MULTIPLE) - overhead; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; SET_NETDEV_DEVTYPE(dev, &device_type); /* We need to keep the dst around in case of icmp replies. */ netif_keep_dst(dev); netif_set_tso_max_size(dev, GSO_MAX_SIZE); wg->dev = dev; } static int wg_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct wg_device *wg = netdev_priv(dev); int ret = -ENOMEM; rcu_assign_pointer(wg->creating_net, link_net); init_rwsem(&wg->static_identity.lock); mutex_init(&wg->socket_update_lock); mutex_init(&wg->device_update_lock); wg_allowedips_init(&wg->peer_allowedips); wg_cookie_checker_init(&wg->cookie_checker, wg); INIT_LIST_HEAD(&wg->peer_list); wg->device_update_gen = 1; wg->peer_hashtable = wg_pubkey_hashtable_alloc(); if (!wg->peer_hashtable) return ret; wg->index_hashtable = wg_index_hashtable_alloc(); if (!wg->index_hashtable) goto err_free_peer_hashtable; wg->handshake_receive_wq = alloc_workqueue("wg-kex-%s", WQ_CPU_INTENSIVE | WQ_FREEZABLE | WQ_PERCPU, 0, dev->name); if (!wg->handshake_receive_wq) goto err_free_index_hashtable; wg->handshake_send_wq = alloc_workqueue("wg-kex-%s", WQ_UNBOUND | WQ_FREEZABLE, 0, dev->name); if (!wg->handshake_send_wq) goto err_destroy_handshake_receive; wg->packet_crypt_wq = alloc_workqueue("wg-crypt-%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_PERCPU, 0, dev->name); if (!wg->packet_crypt_wq) goto err_destroy_handshake_send; ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker, MAX_QUEUED_PACKETS); if (ret < 0) goto err_destroy_packet_crypt; ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker, MAX_QUEUED_PACKETS); if (ret < 0) goto err_free_encrypt_queue; ret = wg_packet_queue_init(&wg->handshake_queue, wg_packet_handshake_receive_worker, MAX_QUEUED_INCOMING_HANDSHAKES); if (ret < 0) goto err_free_decrypt_queue; ret = wg_ratelimiter_init(); if (ret < 0) goto err_free_handshake_queue; netif_threaded_enable(dev); ret = register_netdevice(dev); if (ret < 0) goto err_uninit_ratelimiter; list_add(&wg->device_list, &device_list); /* We wait until the end to assign priv_destructor, so that * register_netdevice doesn't call it for us if it fails. */ dev->priv_destructor = wg_destruct; pr_debug("%s: Interface created\n", dev->name); return ret; err_uninit_ratelimiter: wg_ratelimiter_uninit(); err_free_handshake_queue: wg_packet_queue_free(&wg->handshake_queue, false); err_free_decrypt_queue: wg_packet_queue_free(&wg->decrypt_queue, false); err_free_encrypt_queue: wg_packet_queue_free(&wg->encrypt_queue, false); err_destroy_packet_crypt: destroy_workqueue(wg->packet_crypt_wq); err_destroy_handshake_send: destroy_workqueue(wg->handshake_send_wq); err_destroy_handshake_receive: destroy_workqueue(wg->handshake_receive_wq); err_free_index_hashtable: kvfree(wg->index_hashtable); err_free_peer_hashtable: kvfree(wg->peer_hashtable); return ret; } static struct rtnl_link_ops link_ops __read_mostly = { .kind = KBUILD_MODNAME, .priv_size = sizeof(struct wg_device), .setup = wg_setup, .newlink = wg_newlink, }; static void wg_netns_pre_exit(struct net *net) { struct wg_device *wg; struct wg_peer *peer; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { if (rcu_access_pointer(wg->creating_net) == net) { pr_debug("%s: Creating namespace exiting\n", wg->dev->name); netif_carrier_off(wg->dev); mutex_lock(&wg->device_update_lock); rcu_assign_pointer(wg->creating_net, NULL); wg_socket_reinit(wg, NULL, NULL); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); mutex_unlock(&wg->device_update_lock); } } rtnl_unlock(); } static struct pernet_operations pernet_ops = { .pre_exit = wg_netns_pre_exit }; int __init wg_device_init(void) { int ret; ret = register_pm_notifier(&pm_notifier); if (ret) return ret; ret = register_random_vmfork_notifier(&vm_notifier); if (ret) goto error_pm; ret = register_pernet_device(&pernet_ops); if (ret) goto error_vm; ret = rtnl_link_register(&link_ops); if (ret) goto error_pernet; return 0; error_pernet: unregister_pernet_device(&pernet_ops); error_vm: unregister_random_vmfork_notifier(&vm_notifier); error_pm: unregister_pm_notifier(&pm_notifier); return ret; } void wg_device_uninit(void) { rtnl_link_unregister(&link_ops); unregister_pernet_device(&pernet_ops); unregister_random_vmfork_notifier(&vm_notifier); unregister_pm_notifier(&pm_notifier); rcu_barrier(); }
25 1 1 4 1 12 1 5 2 4 3 1 15 3 12 9 3 10 2 12 12 12 11 11 11 11 8 3 9 2 11 4 3 1 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0+ /* net/sched/act_ctinfo.c netfilter ctinfo connmark actions * * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/pkt_cls.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_ctinfo.h> #include <net/tc_act/tc_ctinfo.h> #include <net/tc_wrapper.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> static struct tc_action_ops act_ctinfo_ops; static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb, int wlen, int proto) { u8 dscp, newdscp; newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) & ~INET_ECN_MASK; switch (proto) { case NFPROTO_IPV4: dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; if (dscp != newdscp) { if (likely(!skb_try_make_writable(skb, wlen))) { ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp); atomic64_inc(&ca->stats_dscp_set); } else { atomic64_inc(&ca->stats_dscp_error); } } break; case NFPROTO_IPV6: dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; if (dscp != newdscp) { if (likely(!skb_try_make_writable(skb, wlen))) { ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp); atomic64_inc(&ca->stats_dscp_set); } else { atomic64_inc(&ca->stats_dscp_error); } } break; default: break; } } static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb) { atomic64_inc(&ca->stats_cpmark_set); skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash = NULL; struct tcf_ctinfo *ca = to_ctinfo(a); struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; enum ip_conntrack_info ctinfo; struct tcf_ctinfo_params *cp; struct nf_conn *ct; int proto, wlen; cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV4; break; case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV6; break; default: goto out; } ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* look harder, usually ingress */ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, cp->net, &tuple)) goto out; zone.id = cp->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; thash = nf_conntrack_find_get(cp->net, &zone, &tuple); if (!thash) goto out; ct = nf_ct_tuplehash_to_ctrack(thash); } if (cp->mode & CTINFO_MODE_DSCP) if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask)) tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); if (cp->mode & CTINFO_MODE_CPMARK) tcf_ctinfo_cpmark_set(ct, ca, cp, skb); if (thash) nf_ct_put(ct); out: return cp->action; } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { [TCA_CTINFO_ACT] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)), [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 }, }; static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; u32 dscpmask = 0, dscpstatemask, index; struct nlattr *tb[TCA_CTINFO_MAX + 1]; struct tcf_ctinfo_params *cp_new; struct tcf_chain *goto_ch = NULL; struct tc_ctinfo *actparm; struct tcf_ctinfo *ci; u8 dscpmaskshift; int ret = 0, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack); if (err < 0) return err; if (!tb[TCA_CTINFO_ACT]) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_CTINFO_ACT attribute"); return -EINVAL; } actparm = nla_data(tb[TCA_CTINFO_ACT]); /* do some basic validation here before dynamically allocating things */ /* that we would otherwise have to clean up. */ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) { dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]); /* need contiguous 6 bit mask */ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0; if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CTINFO_PARMS_DSCP_MASK], "dscp mask must be 6 contiguous bits"); return -EINVAL; } dscpstatemask = nla_get_u32_default(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], 0); /* mask & statemask must not overlap */ if (dscpmask & dscpstatemask) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], "dscp statemask must not overlap dscp mask"); return -EINVAL; } } /* done the validation:now to the actual action allocation */ index = actparm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_ctinfo_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) /* don't override defaults */ return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } else { return err; } err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; ci = to_ctinfo(*a); cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); if (unlikely(!cp_new)) { err = -ENOMEM; goto put_chain; } cp_new->net = net; cp_new->zone = nla_get_u16_default(tb[TCA_CTINFO_ZONE], 0); if (dscpmask) { cp_new->dscpmask = dscpmask; cp_new->dscpmaskshift = dscpmaskshift; cp_new->dscpstatemask = dscpstatemask; cp_new->mode |= CTINFO_MODE_DSCP; } if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) { cp_new->cpmarkmask = nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]); cp_new->mode |= CTINFO_MODE_CPMARK; } cp_new->action = actparm->action; spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, lockdep_is_held(&ci->tcf_lock)); spin_unlock_bh(&ci->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (cp_new) kfree_rcu(cp_new, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { const struct tcf_ctinfo *ci = to_ctinfo(a); unsigned char *b = skb_tail_pointer(skb); const struct tcf_ctinfo_params *cp; struct tc_ctinfo opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; struct tcf_t t; rcu_read_lock(); cp = rcu_dereference(ci->params); tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) goto nla_put_failure; opt.action = cp->action; if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone)) goto nla_put_failure; if (cp->mode & CTINFO_MODE_DSCP) { if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK, cp->dscpmask)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK, cp->dscpstatemask)) goto nla_put_failure; } if (cp->mode & CTINFO_MODE_CPMARK) { if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK, cp->cpmarkmask)) goto nla_put_failure; } if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, atomic64_read(&ci->stats_dscp_set), TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, atomic64_read(&ci->stats_dscp_error), TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, atomic64_read(&ci->stats_cpmark_set), TCA_CTINFO_PAD)) goto nla_put_failure; rcu_read_unlock(); return skb->len; nla_put_failure: rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } static void tcf_ctinfo_cleanup(struct tc_action *a) { struct tcf_ctinfo *ci = to_ctinfo(a); struct tcf_ctinfo_params *cp; cp = rcu_dereference_protected(ci->params, 1); if (cp) kfree_rcu(cp, rcu); } static struct tc_action_ops act_ctinfo_ops = { .kind = "ctinfo", .id = TCA_ID_CTINFO, .owner = THIS_MODULE, .act = tcf_ctinfo_act, .dump = tcf_ctinfo_dump, .init = tcf_ctinfo_init, .cleanup= tcf_ctinfo_cleanup, .size = sizeof(struct tcf_ctinfo), }; MODULE_ALIAS_NET_ACT("ctinfo"); static __net_init int ctinfo_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); return tc_action_net_init(net, tn, &act_ctinfo_ops); } static void __net_exit ctinfo_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_ctinfo_ops.net_id); } static struct pernet_operations ctinfo_net_ops = { .init = ctinfo_init_net, .exit_batch = ctinfo_exit_net, .id = &act_ctinfo_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init ctinfo_init_module(void) { return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops); } static void __exit ctinfo_cleanup_module(void) { tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops); } module_init(ctinfo_init_module); module_exit(ctinfo_cleanup_module); MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>"); MODULE_DESCRIPTION("Connection tracking mark actions"); MODULE_LICENSE("GPL");
5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2021 NXP */ #include "netlink.h" #include "common.h" struct phc_vclocks_req_info { struct ethnl_req_info base; }; struct phc_vclocks_reply_data { struct ethnl_reply_data base; int num; int *index; }; #define PHC_VCLOCKS_REPDATA(__reply_base) \ container_of(__reply_base, struct phc_vclocks_reply_data, base) const struct nla_policy ethnl_phc_vclocks_get_policy[] = { [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->num = ethtool_get_phc_vclocks(dev, &data->index); ethnl_ops_complete(dev); return ret; } static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); int len = 0; if (data->num > 0) { len += nla_total_size(sizeof(u32)); len += nla_total_size(sizeof(s32) * data->num); } return len; } static int phc_vclocks_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); if (data->num <= 0) return 0; if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) || nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX, sizeof(s32) * data->num, data->index)) return -EMSGSIZE; return 0; } static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); kfree(data->index); } const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER, .req_info_size = sizeof(struct phc_vclocks_req_info), .reply_data_size = sizeof(struct phc_vclocks_reply_data), .prepare_data = phc_vclocks_prepare_data, .reply_size = phc_vclocks_reply_size, .fill_reply = phc_vclocks_fill_reply, .cleanup_data = phc_vclocks_cleanup_data, };
16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC virtual connection handler, common bits. * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/skbuff.h> #include "ar-internal.h" /* * Time till a connection expires after last use (in seconds). */ unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60; unsigned int __read_mostly rxrpc_closed_conn_expiry = 10; static void rxrpc_clean_up_connection(struct work_struct *work); static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, unsigned long reap_at); void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { struct rxrpc_local *local = conn->local; bool busy; if (WARN_ON_ONCE(!local)) return; spin_lock_irq(&local->lock); busy = !list_empty(&conn->attend_link); if (!busy) { rxrpc_get_connection(conn, why); list_add_tail(&conn->attend_link, &local->conn_attend_q); } spin_unlock_irq(&local->lock); rxrpc_wake_up_io_thread(local); } static void rxrpc_connection_timer(struct timer_list *timer) { struct rxrpc_connection *conn = container_of(timer, struct rxrpc_connection, timer); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_timer); } /* * allocate a new connection */ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, gfp_t gfp) { struct rxrpc_connection *conn; _enter(""); conn = kzalloc(sizeof(struct rxrpc_connection), gfp); if (conn) { INIT_LIST_HEAD(&conn->cache_link); timer_setup(&conn->timer, &rxrpc_connection_timer, 0); INIT_WORK(&conn->processor, rxrpc_process_connection); INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); INIT_LIST_HEAD(&conn->attend_link); mutex_init(&conn->security_lock); mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; rwlock_init(&conn->security_use_lock); spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->idle_timestamp = jiffies; } _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); return conn; } /* * Look up a connection in the cache by protocol parameters. * * If successful, a pointer to the connection is returned, but no ref is taken. * NULL is returned if there is no match. * * When searching for a service call, if we find a peer but no connection, we * return that through *_peer in case we need to create a new service call. * * The caller must be holding the RCU read lock. */ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, struct sk_buff *skb) { struct rxrpc_connection *conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer; _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); /* Look up client connections by connection ID alone as their * IDs are unique for this machine. */ conn = idr_find(&local->conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); if (!conn || refcount_read(&conn->ref) == 0) { _debug("no conn"); goto not_found; } if (conn->proto.epoch != sp->hdr.epoch || conn->local != local) goto not_found; peer = conn->peer; switch (srx->transport.family) { case AF_INET: if (peer->srx.transport.sin.sin_port != srx->transport.sin.sin_port) goto not_found; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (peer->srx.transport.sin6.sin6_port != srx->transport.sin6.sin6_port) goto not_found; break; #endif default: BUG(); } _leave(" = %p", conn); return conn; not_found: _leave(" = NULL"); return NULL; } /* * Disconnect a call and clear any channel it occupies when that call * terminates. The caller must hold the channel_lock and must release the * call's ref on the connection. */ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, struct rxrpc_call *call) { struct rxrpc_channel *chan = &conn->channels[call->cid & RXRPC_CHANNELMASK]; _enter("%d,%x", conn->debug_id, call->cid); if (chan->call == call) { /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. */ trace_rxrpc_disconnect_call(call); switch (call->completion) { case RXRPC_CALL_SUCCEEDED: chan->last_seq = call->rx_highest_seq; chan->last_type = RXRPC_PACKET_TYPE_ACK; break; case RXRPC_CALL_LOCALLY_ABORTED: chan->last_abort = call->abort_code; chan->last_type = RXRPC_PACKET_TYPE_ABORT; break; default: chan->last_abort = RX_CALL_DEAD; chan->last_type = RXRPC_PACKET_TYPE_ABORT; break; } chan->last_call = chan->call_id; chan->call_id = chan->call_counter; chan->call = NULL; } _leave(""); } /* * Disconnect a call and clear any channel it occupies when that call * terminates. */ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); rxrpc_see_call(call, rxrpc_call_see_disconnected); call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { spin_lock_irq(&call->peer->lock); hlist_del_init(&call->error_link); spin_unlock_irq(&call->peer->lock); } if (rxrpc_is_client_call(call)) { rxrpc_disconnect_client_call(call->bundle, call); } else { __rxrpc_disconnect_call(conn, call); conn->idle_timestamp = jiffies; if (atomic_dec_and_test(&conn->active)) rxrpc_set_service_reap_timer(conn->rxnet, jiffies + rxrpc_connection_expiry * HZ); } rxrpc_put_call(call, rxrpc_call_put_io_thread); } /* * Queue a connection's work processor, getting a ref to pass to the work * queue. */ void rxrpc_queue_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { if (atomic_read(&conn->active) >= 0 && rxrpc_queue_work(&conn->processor)) rxrpc_see_connection(conn, why); } /* * Note the re-emergence of a connection. */ void rxrpc_see_connection(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { if (conn) { int r = refcount_read(&conn->ref); trace_rxrpc_conn(conn->debug_id, r, why); } } /* * Get a ref on a connection. */ struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { int r; __refcount_inc(&conn->ref, &r); trace_rxrpc_conn(conn->debug_id, r + 1, why); return conn; } /* * Try to get a ref on a connection. */ struct rxrpc_connection * rxrpc_get_connection_maybe(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { int r; if (conn) { if (__refcount_inc_not_zero(&conn->ref, &r)) trace_rxrpc_conn(conn->debug_id, r + 1, why); else conn = NULL; } return conn; } /* * Set the service connection reap timer. */ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, unsigned long reap_at) { if (rxnet->live) timer_reduce(&rxnet->service_conn_reap_timer, reap_at); } /* * destroy a virtual connection */ static void rxrpc_rcu_free_connection(struct rcu_head *rcu) { struct rxrpc_connection *conn = container_of(rcu, struct rxrpc_connection, rcu); struct rxrpc_net *rxnet = conn->rxnet; _enter("{%d,u=%d}", conn->debug_id, refcount_read(&conn->ref)); trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref), rxrpc_conn_free); kfree(conn); if (atomic_dec_and_test(&rxnet->nr_conns)) wake_up_var(&rxnet->nr_conns); } /* * Clean up a dead connection. */ static void rxrpc_clean_up_connection(struct work_struct *work) { struct rxrpc_connection *conn = container_of(work, struct rxrpc_connection, destructor); struct rxrpc_net *rxnet = conn->rxnet; ASSERT(!conn->channels[0].call && !conn->channels[1].call && !conn->channels[2].call && !conn->channels[3].call); ASSERT(list_empty(&conn->cache_link)); timer_delete_sync(&conn->timer); cancel_work_sync(&conn->processor); /* Processing may restart the timer */ timer_delete_sync(&conn->timer); write_lock(&rxnet->conn_lock); list_del_init(&conn->proc_link); write_unlock(&rxnet->conn_lock); if (conn->pmtud_probe) { trace_rxrpc_pmtud_lost(conn, 0); conn->peer->pmtud_probing = false; conn->peer->pmtud_pending = true; } rxrpc_purge_queue(&conn->rx_queue); rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response); rxrpc_kill_client_conn(conn); conn->security->clear(conn); key_put(conn->key); rxrpc_put_bundle(conn->bundle, rxrpc_bundle_put_conn); rxrpc_put_peer(conn->peer, rxrpc_peer_put_conn); rxrpc_put_local(conn->local, rxrpc_local_put_kill_conn); /* Drain the Rx queue. Note that even though we've unpublished, an * incoming packet could still be being added to our Rx queue, so we * will need to drain it again in the RCU cleanup handler. */ rxrpc_purge_queue(&conn->rx_queue); page_frag_cache_drain(&conn->tx_data_alloc); call_rcu(&conn->rcu, rxrpc_rcu_free_connection); } /* * Drop a ref on a connection. */ void rxrpc_put_connection(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { unsigned int debug_id; bool dead; int r; if (!conn) return; debug_id = conn->debug_id; dead = __refcount_dec_and_test(&conn->ref, &r); trace_rxrpc_conn(debug_id, r - 1, why); if (dead) { timer_delete(&conn->timer); cancel_work(&conn->processor); if (in_softirq() || work_busy(&conn->processor) || timer_pending(&conn->timer)) /* Can't use the rxrpc workqueue as we need to cancel/flush * something that may be running/waiting there. */ schedule_work(&conn->destructor); else rxrpc_clean_up_connection(&conn->destructor); } } /* * reap dead service connections */ void rxrpc_service_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; struct rxrpc_net *rxnet = container_of(work, struct rxrpc_net, service_conn_reaper); unsigned long expire_at, earliest, idle_timestamp, now; int active; LIST_HEAD(graveyard); _enter(""); now = jiffies; earliest = now + MAX_JIFFY_OFFSET; write_lock(&rxnet->conn_lock); list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { ASSERTCMP(atomic_read(&conn->active), >=, 0); if (likely(atomic_read(&conn->active) > 0)) continue; if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) continue; if (rxnet->live && !conn->local->dead) { idle_timestamp = READ_ONCE(conn->idle_timestamp); expire_at = idle_timestamp + rxrpc_connection_expiry * HZ; if (conn->local->service_closed) expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ; _debug("reap CONN %d { a=%d,t=%ld }", conn->debug_id, atomic_read(&conn->active), (long)expire_at - (long)now); if (time_before(now, expire_at)) { if (time_before(expire_at, earliest)) earliest = expire_at; continue; } } /* The activity count sits at 0 whilst the conn is unused on * the list; we reduce that to -1 to make the conn unavailable. */ active = 0; if (!atomic_try_cmpxchg(&conn->active, &active, -1)) continue; rxrpc_see_connection(conn, rxrpc_conn_see_reap_service); if (rxrpc_conn_is_client(conn)) BUG(); else rxrpc_unpublish_service_conn(conn); list_move_tail(&conn->link, &graveyard); } write_unlock(&rxnet->conn_lock); if (earliest != now + MAX_JIFFY_OFFSET) { _debug("reschedule reaper %ld", (long)earliest - (long)now); ASSERT(time_after(earliest, now)); rxrpc_set_service_reap_timer(rxnet, earliest); } while (!list_empty(&graveyard)) { conn = list_entry(graveyard.next, struct rxrpc_connection, link); list_del_init(&conn->link); ASSERTCMP(atomic_read(&conn->active), ==, -1); rxrpc_put_connection(conn, rxrpc_conn_put_service_reaped); } _leave(""); } /* * preemptively destroy all the service connection records rather than * waiting for them to time out */ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) { struct rxrpc_connection *conn, *_p; bool leak = false; _enter(""); atomic_dec(&rxnet->nr_conns); timer_delete_sync(&rxnet->service_conn_reap_timer); rxrpc_queue_work(&rxnet->service_conn_reaper); flush_workqueue(rxrpc_workqueue); write_lock(&rxnet->conn_lock); list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { pr_err("AF_RXRPC: Leaked conn %p {%d}\n", conn, refcount_read(&conn->ref)); leak = true; } write_unlock(&rxnet->conn_lock); BUG_ON(leak); ASSERT(list_empty(&rxnet->conn_proc_list)); /* We need to wait for the connections to be destroyed by RCU as they * pin things that we still need to get rid of. */ wait_var_event(&rxnet->nr_conns, !atomic_read(&rxnet->nr_conns)); _leave(""); }
9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PFXLEN_H #define _PFXLEN_H #include <asm/byteorder.h> #include <linux/netfilter.h> #include <net/tcp.h> /* Prefixlen maps, by Jan Engelhardt */ extern const union nf_inet_addr ip_set_netmask_map[]; extern const union nf_inet_addr ip_set_hostmask_map[]; static inline __be32 ip_set_netmask(u8 pfxlen) { return ip_set_netmask_map[pfxlen].ip; } static inline const __be32 * ip_set_netmask6(u8 pfxlen) { return &ip_set_netmask_map[pfxlen].ip6[0]; } static inline u32 ip_set_hostmask(u8 pfxlen) { return (__force u32) ip_set_hostmask_map[pfxlen].ip; } static inline const __be32 * ip_set_hostmask6(u8 pfxlen) { return &ip_set_hostmask_map[pfxlen].ip6[0]; } extern u32 ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr); #define ip_set_mask_from_to(from, to, cidr) \ do { \ from &= ip_set_hostmask(cidr); \ to = from | ~ip_set_hostmask(cidr); \ } while (0) static inline void ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip->ip6[0] &= ip_set_netmask6(prefix)[0]; ip->ip6[1] &= ip_set_netmask6(prefix)[1]; ip->ip6[2] &= ip_set_netmask6(prefix)[2]; ip->ip6[3] &= ip_set_netmask6(prefix)[3]; } #endif /*_PFXLEN_H */
15 2 15 15 2 14 15 15 14 13 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/device.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/cdev.h> #include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <rdma/ib.h> #include <rdma/uverbs_std_types.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_ucaps.h> #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace verbs access"); MODULE_LICENSE("Dual BSD/GPL"); enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS, IB_UVERBS_NUM_FIXED_MINOR = 32, IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR, }; #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) static dev_t dynamic_uverbs_dev; static DEFINE_IDA(uverbs_ida); static int ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); static struct ib_client uverbs_client; static char *uverbs_devnode(const struct device *dev, umode_t *mode) { if (mode) *mode = 0666; return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } static const struct class uverbs_class = { .name = "infiniband_verbs", .devnode = uverbs_devnode, }; /* * Must be called with the ufile->device->disassociate_srcu held, and the lock * must be held until use of the ucontext is finished. */ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile) { /* * We do not hold the hw_destroy_rwsem lock for this flow, instead * srcu is used. It does not matter if someone races this with * get_context, we get NULL or valid ucontext. */ struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext); if (!srcu_dereference(ufile->device->ib_dev, &ufile->device->disassociate_srcu)) return ERR_PTR(-EIO); if (!ucontext) return ERR_PTR(-EINVAL); return ucontext; } EXPORT_SYMBOL(ib_uverbs_get_ucontext_file); int uverbs_dealloc_mw(struct ib_mw *mw) { struct ib_pd *pd = mw->pd; int ret; ret = mw->device->ops.dealloc_mw(mw); if (ret) return ret; atomic_dec(&pd->usecnt); kfree(mw); return ret; } static void ib_uverbs_release_dev(struct device *device) { struct ib_uverbs_device *dev = container_of(device, struct ib_uverbs_device, dev); uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); mutex_destroy(&dev->lists_mutex); mutex_destroy(&dev->xrcd_tree_mutex); kfree(dev); } void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, struct ib_ucq_object *uobj) { struct ib_uverbs_event *evt, *tmp; if (ev_file) { spin_lock_irq(&ev_file->ev_queue.lock); list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) { list_del(&evt->list); kfree(evt); } spin_unlock_irq(&ev_file->ev_queue.lock); uverbs_uobject_put(&ev_file->uobj); } ib_uverbs_release_uevent(&uobj->uevent); } void ib_uverbs_release_uevent(struct ib_uevent_object *uobj) { struct ib_uverbs_async_event_file *async_file = uobj->event_file; struct ib_uverbs_event *evt, *tmp; if (!async_file) return; spin_lock_irq(&async_file->ev_queue.lock); list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) { list_del(&evt->list); kfree(evt); } spin_unlock_irq(&async_file->ev_queue.lock); uverbs_uobject_put(&async_file->uobj); } void ib_uverbs_detach_umcast(struct ib_qp *qp, struct ib_uqp_object *uobj) { struct ib_uverbs_mcast_entry *mcast, *tmp; list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) { ib_detach_mcast(qp, &mcast->gid, mcast->lid); list_del(&mcast->list); kfree(mcast); } } static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) { complete(&dev->comp); } void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); struct ib_device *ib_dev; int srcu_key; release_ufile_idr_uobject(file); srcu_key = srcu_read_lock(&file->device->disassociate_srcu); ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); if (ib_dev && !ib_dev->ops.disassociate_ucontext) module_put(ib_dev->ops.owner); srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); if (refcount_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); if (file->default_async_file) uverbs_uobject_put(&file->default_async_file->uobj); put_device(&file->device->dev); if (file->disassociate_page) __free_pages(file->disassociate_page, 0); mutex_destroy(&file->disassociation_lock); mutex_destroy(&file->umap_lock); mutex_destroy(&file->ucontext_lock); kfree(file); } static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, struct file *filp, char __user *buf, size_t count, loff_t *pos, size_t eventsz) { struct ib_uverbs_event *event; int ret = 0; spin_lock_irq(&ev_queue->lock); while (list_empty(&ev_queue->event_list)) { if (ev_queue->is_closed) { spin_unlock_irq(&ev_queue->lock); return -EIO; } spin_unlock_irq(&ev_queue->lock); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(ev_queue->poll_wait, (!list_empty(&ev_queue->event_list) || ev_queue->is_closed))) return -ERESTARTSYS; spin_lock_irq(&ev_queue->lock); } event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list); if (eventsz > count) { ret = -EINVAL; event = NULL; } else { list_del(ev_queue->event_list.next); if (event->counter) { ++(*event->counter); list_del(&event->obj_list); } } spin_unlock_irq(&ev_queue->lock); if (event) { if (copy_to_user(buf, event, eventsz)) ret = -EFAULT; else ret = eventsz; } kfree(event); return ret; } static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_async_event_file *file = filp->private_data; return ib_uverbs_event_read(&file->ev_queue, filp, buf, count, pos, sizeof(struct ib_uverbs_async_event_desc)); } static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_completion_event_file *comp_ev_file = filp->private_data; return ib_uverbs_event_read(&comp_ev_file->ev_queue, filp, buf, count, pos, sizeof(struct ib_uverbs_comp_event_desc)); } static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, struct file *filp, struct poll_table_struct *wait) { __poll_t pollflags = 0; poll_wait(filp, &ev_queue->poll_wait, wait); spin_lock_irq(&ev_queue->lock); if (!list_empty(&ev_queue->event_list)) pollflags = EPOLLIN | EPOLLRDNORM; else if (ev_queue->is_closed) pollflags = EPOLLERR; spin_unlock_irq(&ev_queue->lock); return pollflags; } static __poll_t ib_uverbs_async_event_poll(struct file *filp, struct poll_table_struct *wait) { struct ib_uverbs_async_event_file *file = filp->private_data; return ib_uverbs_event_poll(&file->ev_queue, filp, wait); } static __poll_t ib_uverbs_comp_event_poll(struct file *filp, struct poll_table_struct *wait) { struct ib_uverbs_completion_event_file *comp_ev_file = filp->private_data; return ib_uverbs_event_poll(&comp_ev_file->ev_queue, filp, wait); } static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on) { struct ib_uverbs_async_event_file *file = filp->private_data; return fasync_helper(fd, filp, on, &file->ev_queue.async_queue); } static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on) { struct ib_uverbs_completion_event_file *comp_ev_file = filp->private_data; return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue); } const struct file_operations uverbs_event_fops = { .owner = THIS_MODULE, .read = ib_uverbs_comp_event_read, .poll = ib_uverbs_comp_event_poll, .release = uverbs_uobject_fd_release, .fasync = ib_uverbs_comp_event_fasync, }; const struct file_operations uverbs_async_event_fops = { .owner = THIS_MODULE, .read = ib_uverbs_async_event_read, .poll = ib_uverbs_async_event_poll, .release = uverbs_async_event_release, .fasync = ib_uverbs_async_event_fasync, }; void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) { struct ib_uverbs_event_queue *ev_queue = cq_context; struct ib_ucq_object *uobj; struct ib_uverbs_event *entry; unsigned long flags; if (!ev_queue) return; spin_lock_irqsave(&ev_queue->lock, flags); if (ev_queue->is_closed) { spin_unlock_irqrestore(&ev_queue->lock, flags); return; } entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) { spin_unlock_irqrestore(&ev_queue->lock, flags); return; } uobj = cq->uobject; entry->desc.comp.cq_handle = cq->uobject->uevent.uobject.user_handle; entry->counter = &uobj->comp_events_reported; list_add_tail(&entry->list, &ev_queue->event_list); list_add_tail(&entry->obj_list, &uobj->comp_list); spin_unlock_irqrestore(&ev_queue->lock, flags); wake_up_interruptible(&ev_queue->poll_wait); kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN); } void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, __u64 element, __u64 event, struct list_head *obj_list, u32 *counter) { struct ib_uverbs_event *entry; unsigned long flags; if (!async_file) return; spin_lock_irqsave(&async_file->ev_queue.lock, flags); if (async_file->ev_queue.is_closed) { spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); return; } entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) { spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); return; } entry->desc.async.element = element; entry->desc.async.event_type = event; entry->desc.async.reserved = 0; entry->counter = counter; list_add_tail(&entry->list, &async_file->ev_queue.event_list); if (obj_list) list_add_tail(&entry->obj_list, obj_list); spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); wake_up_interruptible(&async_file->ev_queue.poll_wait); kill_fasync(&async_file->ev_queue.async_queue, SIGIO, POLL_IN); } static void uverbs_uobj_event(struct ib_uevent_object *eobj, struct ib_event *event) { ib_uverbs_async_handler(eobj->event_file, eobj->uobject.user_handle, event->event, &eobj->event_list, &eobj->events_reported); } void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) { uverbs_uobj_event(&event->element.cq->uobject->uevent, event); } void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) { /* for XRC target qp's, check that qp is live */ if (!event->element.qp->uobject) return; uverbs_uobj_event(&event->element.qp->uobject->uevent, event); } void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr) { uverbs_uobj_event(&event->element.wq->uobject->uevent, event); } void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) { uverbs_uobj_event(&event->element.srq->uobject->uevent, event); } static void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event) { ib_uverbs_async_handler( container_of(handler, struct ib_uverbs_async_event_file, event_handler), event->element.port_num, event->event, NULL, NULL); } void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue) { spin_lock_init(&ev_queue->lock); INIT_LIST_HEAD(&ev_queue->event_list); init_waitqueue_head(&ev_queue->poll_wait); ev_queue->is_closed = 0; ev_queue->async_queue = NULL; } void ib_uverbs_init_async_event_file( struct ib_uverbs_async_event_file *async_file) { struct ib_uverbs_file *uverbs_file = async_file->uobj.ufile; struct ib_device *ib_dev = async_file->uobj.context->device; ib_uverbs_init_event_queue(&async_file->ev_queue); /* The first async_event_file becomes the default one for the file. */ mutex_lock(&uverbs_file->ucontext_lock); if (!uverbs_file->default_async_file) { /* Pairs with the put in ib_uverbs_release_file */ uverbs_uobject_get(&async_file->uobj); smp_store_release(&uverbs_file->default_async_file, async_file); } mutex_unlock(&uverbs_file->ucontext_lock); INIT_IB_EVENT_HANDLER(&async_file->event_handler, ib_dev, ib_uverbs_event_handler); ib_register_event_handler(&async_file->event_handler); } static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, struct ib_uverbs_ex_cmd_hdr *ex_hdr, size_t count, const struct uverbs_api_write_method *method_elm) { if (method_elm->is_ex) { count -= sizeof(*hdr) + sizeof(*ex_hdr); if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count) return -EINVAL; if (hdr->in_words * 8 < method_elm->req_size) return -ENOSPC; if (ex_hdr->cmd_hdr_reserved) return -EINVAL; if (ex_hdr->response) { if (!hdr->out_words && !ex_hdr->provider_out_words) return -EINVAL; if (hdr->out_words * 8 < method_elm->resp_size) return -ENOSPC; if (!access_ok(u64_to_user_ptr(ex_hdr->response), (hdr->out_words + ex_hdr->provider_out_words) * 8)) return -EFAULT; } else { if (hdr->out_words || ex_hdr->provider_out_words) return -EINVAL; } return 0; } /* not extended command */ if (hdr->in_words * 4 != count) return -EINVAL; if (count < method_elm->req_size + sizeof(*hdr)) { /* * rdma-core v18 and v19 have a bug where they send DESTROY_CQ * with a 16 byte write instead of 24. Old kernels didn't * check the size so they allowed this. Now that the size is * checked provide a compatibility work around to not break * those userspaces. */ if (hdr->command == IB_USER_VERBS_CMD_DESTROY_CQ && count == 16) { hdr->in_words = 6; return 0; } return -ENOSPC; } if (hdr->out_words * 4 < method_elm->resp_size) return -ENOSPC; return 0; } static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; const struct uverbs_api_write_method *method_elm; struct uverbs_api *uapi = file->device->uapi; struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_uverbs_cmd_hdr hdr; struct uverbs_attr_bundle bundle; int srcu_key; ssize_t ret; if (!ib_safe_file_access(filp)) { pr_err_once("uverbs_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", task_tgid_vnr(current), current->comm); return -EACCES; } if (count < sizeof(hdr)) return -EINVAL; if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; method_elm = uapi_get_method(uapi, hdr.command); if (IS_ERR(method_elm)) return PTR_ERR(method_elm); if (method_elm->is_ex) { if (count < (sizeof(hdr) + sizeof(ex_hdr))) return -EINVAL; if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) return -EFAULT; } ret = verify_hdr(&hdr, &ex_hdr, count, method_elm); if (ret) return ret; srcu_key = srcu_read_lock(&file->device->disassociate_srcu); buf += sizeof(hdr); memset(bundle.attr_present, 0, sizeof(bundle.attr_present)); bundle.ufile = file; bundle.context = NULL; /* only valid if bundle has uobject */ bundle.uobject = NULL; if (!method_elm->is_ex) { size_t in_len = hdr.in_words * 4 - sizeof(hdr); size_t out_len = hdr.out_words * 4; u64 response = 0; if (method_elm->has_udata) { bundle.driver_udata.inlen = in_len - method_elm->req_size; in_len = method_elm->req_size; if (bundle.driver_udata.inlen) bundle.driver_udata.inbuf = buf + in_len; else bundle.driver_udata.inbuf = NULL; } else { memset(&bundle.driver_udata, 0, sizeof(bundle.driver_udata)); } if (method_elm->has_resp) { /* * The macros check that if has_resp is set * then the command request structure starts * with a '__aligned u64 response' member. */ ret = get_user(response, (const u64 __user *)buf); if (ret) goto out_unlock; if (method_elm->has_udata) { bundle.driver_udata.outlen = out_len - method_elm->resp_size; out_len = method_elm->resp_size; if (bundle.driver_udata.outlen) bundle.driver_udata.outbuf = u64_to_user_ptr(response + out_len); else bundle.driver_udata.outbuf = NULL; } } else { bundle.driver_udata.outlen = 0; bundle.driver_udata.outbuf = NULL; } ib_uverbs_init_udata_buf_or_null( &bundle.ucore, buf, u64_to_user_ptr(response), in_len, out_len); } else { buf += sizeof(ex_hdr); ib_uverbs_init_udata_buf_or_null(&bundle.ucore, buf, u64_to_user_ptr(ex_hdr.response), hdr.in_words * 8, hdr.out_words * 8); ib_uverbs_init_udata_buf_or_null( &bundle.driver_udata, buf + bundle.ucore.inlen, u64_to_user_ptr(ex_hdr.response) + bundle.ucore.outlen, ex_hdr.provider_in_words * 8, ex_hdr.provider_out_words * 8); } ret = method_elm->handler(&bundle); if (bundle.uobject) uverbs_finalize_object(bundle.uobject, UVERBS_ACCESS_NEW, true, !ret, &bundle); out_unlock: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return (ret) ? : count; } static const struct vm_operations_struct rdma_umap_ops; static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) { struct ib_uverbs_file *file = filp->private_data; struct ib_ucontext *ucontext; int ret = 0; int srcu_key; srcu_key = srcu_read_lock(&file->device->disassociate_srcu); ucontext = ib_uverbs_get_ucontext_file(file); if (IS_ERR(ucontext)) { ret = PTR_ERR(ucontext); goto out; } mutex_lock(&file->disassociation_lock); vma->vm_ops = &rdma_umap_ops; ret = ucontext->device->ops.mmap(ucontext, vma); mutex_unlock(&file->disassociation_lock); out: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; } /* * The VMA has been dup'd, initialize the vm_private_data with a new tracking * struct */ static void rdma_umap_open(struct vm_area_struct *vma) { struct ib_uverbs_file *ufile = vma->vm_file->private_data; struct rdma_umap_priv *opriv = vma->vm_private_data; struct rdma_umap_priv *priv; if (!opriv) return; /* We are racing with disassociation */ if (!down_read_trylock(&ufile->hw_destroy_rwsem)) goto out_zap; mutex_lock(&ufile->disassociation_lock); /* * Disassociation already completed, the VMA should already be zapped. */ if (!ufile->ucontext) goto out_unlock; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) goto out_unlock; rdma_umap_priv_init(priv, vma, opriv->entry); mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); return; out_unlock: mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); out_zap: /* * We can't allow the VMA to be created with the actual IO pages, that * would break our API contract, and it can't be stopped at this * point, so zap it. */ vma->vm_private_data = NULL; zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); } static void rdma_umap_close(struct vm_area_struct *vma) { struct ib_uverbs_file *ufile = vma->vm_file->private_data; struct rdma_umap_priv *priv = vma->vm_private_data; if (!priv) return; /* * The vma holds a reference on the struct file that created it, which * in turn means that the ib_uverbs_file is guaranteed to exist at * this point. */ mutex_lock(&ufile->umap_lock); if (priv->entry) rdma_user_mmap_entry_put(priv->entry); list_del(&priv->list); mutex_unlock(&ufile->umap_lock); kfree(priv); } /* * Once the zap_vma_ptes has been called touches to the VMA will come here and * we return a dummy writable zero page for all the pfns. */ static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) { struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data; struct rdma_umap_priv *priv = vmf->vma->vm_private_data; vm_fault_t ret = 0; if (!priv) return VM_FAULT_SIGBUS; /* Read only pages can just use the system zero page. */ if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { vmf->page = ZERO_PAGE(vmf->address); get_page(vmf->page); return 0; } mutex_lock(&ufile->umap_lock); if (!ufile->disassociate_page) ufile->disassociate_page = alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0); if (ufile->disassociate_page) { /* * This VMA is forced to always be shared so this doesn't have * to worry about COW. */ vmf->page = ufile->disassociate_page; get_page(vmf->page); } else { ret = VM_FAULT_SIGBUS; } mutex_unlock(&ufile->umap_lock); return ret; } static const struct vm_operations_struct rdma_umap_ops = { .open = rdma_umap_open, .close = rdma_umap_close, .fault = rdma_umap_fault, }; void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) { struct rdma_umap_priv *priv, *next_priv; mutex_lock(&ufile->disassociation_lock); while (1) { struct mm_struct *mm = NULL; /* Get an arbitrary mm pointer that hasn't been cleaned yet */ mutex_lock(&ufile->umap_lock); while (!list_empty(&ufile->umaps)) { int ret; priv = list_first_entry(&ufile->umaps, struct rdma_umap_priv, list); mm = priv->vma->vm_mm; ret = mmget_not_zero(mm); if (!ret) { list_del_init(&priv->list); if (priv->entry) { rdma_user_mmap_entry_put(priv->entry); priv->entry = NULL; } mm = NULL; continue; } break; } mutex_unlock(&ufile->umap_lock); if (!mm) { mutex_unlock(&ufile->disassociation_lock); return; } /* * The umap_lock is nested under mmap_lock since it used within * the vma_ops callbacks, so we have to clean the list one mm * at a time to get the lock ordering right. Typically there * will only be one mm, so no big deal. */ mmap_read_lock(mm); mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { struct vm_area_struct *vma = priv->vma; if (vma->vm_mm != mm) continue; list_del_init(&priv->list); zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); if (priv->entry) { rdma_user_mmap_entry_put(priv->entry); priv->entry = NULL; } } mutex_unlock(&ufile->umap_lock); mmap_read_unlock(mm); mmput(mm); } mutex_unlock(&ufile->disassociation_lock); } /** * rdma_user_mmap_disassociate() - Revoke mmaps for a device * @device: device to revoke * * This function should be called by drivers that need to disable mmaps for the * device, for instance because it is going to be reset. */ void rdma_user_mmap_disassociate(struct ib_device *device) { struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); struct ib_uverbs_file *ufile; mutex_lock(&uverbs_dev->lists_mutex); list_for_each_entry(ufile, &uverbs_dev->uverbs_file_list, list) { if (ufile->ucontext) uverbs_user_mmap_disassociate(ufile); } mutex_unlock(&uverbs_dev->lists_mutex); } EXPORT_SYMBOL(rdma_user_mmap_disassociate); /* * ib_uverbs_open() does not need the BKL: * * - the ib_uverbs_device structures are properly reference counted and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - there is no ioctl method to race against; * - the open method will either immediately run -ENXIO, or all * required initialization will be done. */ static int ib_uverbs_open(struct inode *inode, struct file *filp) { struct ib_uverbs_device *dev; struct ib_uverbs_file *file; struct ib_device *ib_dev; int ret; int module_dependent; int srcu_key; dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); if (!refcount_inc_not_zero(&dev->refcount)) return -ENXIO; get_device(&dev->dev); srcu_key = srcu_read_lock(&dev->disassociate_srcu); mutex_lock(&dev->lists_mutex); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (!ib_dev) { ret = -EIO; goto err; } if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) { ret = -EPERM; goto err; } /* In case IB device supports disassociate ucontext, there is no hard * dependency between uverbs device and its low level device. */ module_dependent = !(ib_dev->ops.disassociate_ucontext); if (module_dependent) { if (!try_module_get(ib_dev->ops.owner)) { ret = -ENODEV; goto err; } } file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) { ret = -ENOMEM; if (module_dependent) goto err_module; goto err; } file->device = dev; kref_init(&file->ref); mutex_init(&file->ucontext_lock); spin_lock_init(&file->uobjects_lock); INIT_LIST_HEAD(&file->uobjects); init_rwsem(&file->hw_destroy_rwsem); mutex_init(&file->umap_lock); INIT_LIST_HEAD(&file->umaps); mutex_init(&file->disassociation_lock); filp->private_data = file; list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); setup_ufile_idr_uobject(file); return stream_open(inode, filp); err_module: module_put(ib_dev->ops.owner); err: mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); if (refcount_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); put_device(&dev->dev); return ret; } static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE); mutex_lock(&file->device->lists_mutex); list_del_init(&file->list); mutex_unlock(&file->device->lists_mutex); kref_put(&file->ref, ib_uverbs_release_file); return 0; } static const struct file_operations uverbs_fops = { .owner = THIS_MODULE, .write = ib_uverbs_write, .open = ib_uverbs_open, .release = ib_uverbs_close, .unlocked_ioctl = ib_uverbs_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static const struct file_operations uverbs_mmap_fops = { .owner = THIS_MODULE, .write = ib_uverbs_write, .mmap = ib_uverbs_mmap, .open = ib_uverbs_open, .release = ib_uverbs_close, .unlocked_ioctl = ib_uverbs_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res) { struct ib_uverbs_device *uverbs_dev = client_data; int ret; if (res->port != -1) return -EINVAL; res->abi = ibdev->ops.uverbs_abi_ver; res->cdev = &uverbs_dev->dev; /* * To support DRIVER_ID binding in userspace some of the driver need * upgrading to expose their PCI dependent revision information * through get_context instead of relying on modalias matching. When * the drivers are fixed they can drop this flag. */ if (!ibdev->ops.uverbs_no_driver_id_binding) { ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, ibdev->ops.driver_id); if (ret) return ret; } return 0; } static struct ib_client uverbs_client = { .name = "uverbs", .no_kverbs_req = true, .add = ib_uverbs_add_one, .remove = ib_uverbs_remove_one, .get_nl_info = ib_uverbs_get_nl_info, }; MODULE_ALIAS_RDMA_CLIENT("uverbs"); static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; struct ib_device *ib_dev; srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) ret = sysfs_emit(buf, "%s\n", dev_name(&ib_dev->dev)); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; } static DEVICE_ATTR_RO(ibdev); static ssize_t abi_version_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; struct ib_device *ib_dev; srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) ret = sysfs_emit(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; } static DEVICE_ATTR_RO(abi_version); static struct attribute *ib_dev_attrs[] = { &dev_attr_abi_version.attr, &dev_attr_ibdev.attr, NULL, }; static const struct attribute_group dev_attr_group = { .attrs = ib_dev_attrs, }; static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); static int ib_uverbs_create_uapi(struct ib_device *device, struct ib_uverbs_device *uverbs_dev) { struct uverbs_api *uapi; uapi = uverbs_alloc_api(device); if (IS_ERR(uapi)) return PTR_ERR(uapi); uverbs_dev->uapi = uapi; return 0; } static int ib_uverbs_add_one(struct ib_device *device) { int devnum; dev_t base; struct ib_uverbs_device *uverbs_dev; int ret; if (!device->ops.alloc_ucontext || device->type == RDMA_DEVICE_TYPE_SMI) return -EOPNOTSUPP; uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); if (!uverbs_dev) return -ENOMEM; ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); if (ret) { kfree(uverbs_dev); return -ENOMEM; } device_initialize(&uverbs_dev->dev); uverbs_dev->dev.class = &uverbs_class; uverbs_dev->dev.parent = device->dev.parent; uverbs_dev->dev.release = ib_uverbs_release_dev; uverbs_dev->groups[0] = &dev_attr_group; uverbs_dev->dev.groups = uverbs_dev->groups; refcount_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); mutex_init(&uverbs_dev->lists_mutex); INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, GFP_KERNEL); if (devnum < 0) { ret = -ENOMEM; goto err; } uverbs_dev->devnum = devnum; if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; else base = IB_UVERBS_BASE_DEV + devnum; ret = ib_uverbs_create_uapi(device, uverbs_dev); if (ret) goto err_uapi; uverbs_dev->dev.devt = base; dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); cdev_init(&uverbs_dev->cdev, device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops); uverbs_dev->cdev.owner = THIS_MODULE; ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev); if (ret) goto err_uapi; ib_set_client_data(device, &uverbs_client, uverbs_dev); return 0; err_uapi: ida_free(&uverbs_ida, devnum); err: if (refcount_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); put_device(&uverbs_dev->dev); return ret; } static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, struct ib_device *ib_dev) { struct ib_uverbs_file *file; /* Pending running commands to terminate */ uverbs_disassociate_api_pre(uverbs_dev); mutex_lock(&uverbs_dev->lists_mutex); while (!list_empty(&uverbs_dev->uverbs_file_list)) { file = list_first_entry(&uverbs_dev->uverbs_file_list, struct ib_uverbs_file, list); list_del_init(&file->list); kref_get(&file->ref); /* We must release the mutex before going ahead and calling * uverbs_cleanup_ufile, as it might end up indirectly calling * uverbs_close, for example due to freeing the resources (e.g * mmput). */ mutex_unlock(&uverbs_dev->lists_mutex); uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE); kref_put(&file->ref, ib_uverbs_release_file); mutex_lock(&uverbs_dev->lists_mutex); } mutex_unlock(&uverbs_dev->lists_mutex); uverbs_disassociate_api(uverbs_dev->uapi); } static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) { struct ib_uverbs_device *uverbs_dev = client_data; int wait_clients = 1; cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); ida_free(&uverbs_ida, uverbs_dev->devnum); if (device->ops.disassociate_ucontext) { /* We disassociate HW resources and immediately return. * Userspace will see a EIO errno for all future access. * Upon returning, ib_device may be freed internally and is not * valid any more. * uverbs_device is still available until all clients close * their files, then the uverbs device ref count will be zero * and its resources will be freed. * Note: At this point no more files can be opened since the * cdev was deleted, however active clients can still issue * commands and close their open files. */ ib_uverbs_free_hw_resources(uverbs_dev, device); wait_clients = 0; } if (refcount_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); put_device(&uverbs_dev->dev); } static int __init ib_uverbs_init(void) { int ret; ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_NUM_FIXED_MINOR, "infiniband_verbs"); if (ret) { pr_err("user_verbs: couldn't register device number\n"); goto out; } ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0, IB_UVERBS_NUM_DYNAMIC_MINOR, "infiniband_verbs"); if (ret) { pr_err("couldn't register dynamic device number\n"); goto out_alloc; } ret = class_register(&uverbs_class); if (ret) { pr_err("user_verbs: couldn't create class infiniband_verbs\n"); goto out_chrdev; } ret = class_create_file(&uverbs_class, &class_attr_abi_version.attr); if (ret) { pr_err("user_verbs: couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&uverbs_client); if (ret) { pr_err("user_verbs: couldn't register client\n"); goto out_class; } return 0; out_class: class_unregister(&uverbs_class); out_chrdev: unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); out_alloc: unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_NUM_FIXED_MINOR); out: return ret; } static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); class_unregister(&uverbs_class); unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); ib_cleanup_ucaps(); mmu_notifier_synchronize(); } module_init(ib_uverbs_init); module_exit(ib_uverbs_cleanup);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 // SPDX-License-Identifier: GPL-2.0-only /* * xt_ipvs - kernel module to match IPVS connection properties * * Author: Hannes Eder <heder@google.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #endif #include <linux/ip_vs.h> #include <linux/types.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_ipvs.h> #include <net/netfilter/nf_conntrack.h> #include <net/ip_vs.h> MODULE_AUTHOR("Hannes Eder <heder@google.com>"); MODULE_DESCRIPTION("Xtables: match IPVS connection properties"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_ipvs"); MODULE_ALIAS("ip6t_ipvs"); /* borrowed from xt_conntrack */ static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr, const union nf_inet_addr *uaddr, const union nf_inet_addr *umask, unsigned int l3proto) { if (l3proto == NFPROTO_IPV4) return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; #ifdef CONFIG_IP_VS_IPV6 else if (l3proto == NFPROTO_IPV6) return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6) == 0; #endif else return false; } static bool ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_ipvs_mtinfo *data = par->matchinfo; struct netns_ipvs *ipvs = net_ipvs(xt_net(par)); /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */ const u_int8_t family = xt_family(par); struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; bool match = true; if (data->bitmask == XT_IPVS_IPVS_PROPERTY) { match = skb->ipvs_property ^ !!(data->invert & XT_IPVS_IPVS_PROPERTY); goto out; } /* other flags than XT_IPVS_IPVS_PROPERTY are set */ if (!skb->ipvs_property) { match = false; goto out; } ip_vs_fill_iph_skb(family, skb, true, &iph); if (data->bitmask & XT_IPVS_PROTO) if ((iph.protocol == data->l4proto) ^ !(data->invert & XT_IPVS_PROTO)) { match = false; goto out; } pp = ip_vs_proto_get(iph.protocol); if (unlikely(!pp)) { match = false; goto out; } /* * Check if the packet belongs to an existing entry */ cp = pp->conn_out_get(ipvs, family, skb, &iph); if (unlikely(cp == NULL)) { match = false; goto out; } /* * We found a connection, i.e. ct != 0, make sure to call * __ip_vs_conn_put before returning. In our case jump to out_put_con. */ if (data->bitmask & XT_IPVS_VPORT) if ((cp->vport == data->vport) ^ !(data->invert & XT_IPVS_VPORT)) { match = false; goto out_put_cp; } if (data->bitmask & XT_IPVS_VPORTCTL) if ((cp->control != NULL && cp->control->vport == data->vportctl) ^ !(data->invert & XT_IPVS_VPORTCTL)) { match = false; goto out_put_cp; } if (data->bitmask & XT_IPVS_DIR) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) { match = false; goto out_put_cp; } if ((ctinfo >= IP_CT_IS_REPLY) ^ !!(data->invert & XT_IPVS_DIR)) { match = false; goto out_put_cp; } } if (data->bitmask & XT_IPVS_METHOD) if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^ !(data->invert & XT_IPVS_METHOD)) { match = false; goto out_put_cp; } if (data->bitmask & XT_IPVS_VADDR) { if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr, &data->vmask, family) ^ !(data->invert & XT_IPVS_VADDR)) { match = false; goto out_put_cp; } } out_put_cp: __ip_vs_conn_put(cp); out: pr_debug("match=%d\n", match); return match; } static int ipvs_mt_check(const struct xt_mtchk_param *par) { if (par->family != NFPROTO_IPV4 #ifdef CONFIG_IP_VS_IPV6 && par->family != NFPROTO_IPV6 #endif ) { pr_info_ratelimited("protocol family %u not supported\n", par->family); return -EINVAL; } return 0; } static struct xt_match xt_ipvs_mt_reg __read_mostly = { .name = "ipvs", .revision = 0, .family = NFPROTO_UNSPEC, .match = ipvs_mt, .checkentry = ipvs_mt_check, .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)), .me = THIS_MODULE, }; static int __init ipvs_mt_init(void) { return xt_register_match(&xt_ipvs_mt_reg); } static void __exit ipvs_mt_exit(void) { xt_unregister_match(&xt_ipvs_mt_reg); } module_init(ipvs_mt_init); module_exit(ipvs_mt_exit);
1213 1216 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if_arp.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/llc.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/openvswitch.h> #include <linux/export.h> #include <net/ip_tunnels.h> #include <net/rtnetlink.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. */ static void netdev_port_receive(struct sk_buff *skb) { struct vport *vport; vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; if (unlikely(skb_warn_if_lro(skb))) goto error; /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; if (skb->dev->type == ARPHRD_ETHER) skb_push_rcsum(skb, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: kfree_skb(skb); } /* Called with rcu_read_lock and bottom-halves disabled. */ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } static struct net_device *get_dpdev(const struct datapath *dp) { struct vport *local; local = ovs_vport_ovsl(dp, OVSP_LOCAL); return local->dev; } struct vport *ovs_netdev_link(struct vport *vport, const char *name) { int err; vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); if (!vport->dev) { err = -ENODEV; goto error_free_vport; } /* Ensure that the device exists and that the provided * name is not one of its aliases. */ if (strcmp(name, ovs_vport_name(vport))) { err = -ENODEV; goto error_put; } netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && vport->dev->type != ARPHRD_NONE) || ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) goto error_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; dev_disable_lro(vport->dev); dev_set_promiscuity(vport->dev, 1); vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: netdev_put(vport->dev, &vport->dev_tracker); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; return ovs_netdev_link(vport, parms->name); } static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); netdev_put(vport->dev, &vport->dev_tracker); ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { ASSERT_RTNL(); vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(vport->dev); netdev_upper_dev_unlink(vport->dev, netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { /* When called from ovs_db_notify_wq() after a dp_device_event(), the * port has already been detached, so we can avoid taking the RTNL by * checking this first. */ if (netif_is_ovs_port(vport->dev)) { rtnl_lock(); /* Check again while holding the lock to ensure we don't race * with the netdev notifier and detach twice. */ if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); } call_rcu(&vport->rcu, vport_netdev_free); } void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and * underlying netdev deregistration; delete the link only * if it's not already shutting down. */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else return NULL; } static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) { return ovs_vport_ops_register(&ovs_netdev_vport_ops); } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); }
133 134 9 9 11 11 20 134 134 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 // SPDX-License-Identifier: GPL-2.0 /* * Author: Andrei Vagin <avagin@openvz.org> * Author: Dmitry Safonov <dima@arista.com> */ #include <linux/time_namespace.h> #include <linux/user_namespace.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/clocksource.h> #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> #include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> #include <vdso/datapage.h> ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) { ktime_t offset; switch (clockid) { case CLOCK_MONOTONIC: offset = timespec64_to_ktime(ns_offsets->monotonic); break; case CLOCK_BOOTTIME: case CLOCK_BOOTTIME_ALARM: offset = timespec64_to_ktime(ns_offsets->boottime); break; default: return tim; } /* * Check that @tim value is in [offset, KTIME_MAX + offset] * and subtract offset. */ if (tim < offset) { /* * User can specify @tim *absolute* value - if it's lesser than * the time namespace's offset - it's already expired. */ tim = 0; } else { tim = ktime_sub(tim, offset); if (unlikely(tim > KTIME_MAX)) tim = KTIME_MAX; } return tim; } static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); } static void dec_time_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); } /** * clone_time_ns - Clone a time namespace * @user_ns: User namespace which owns a new namespace. * @old_ns: Namespace to clone * * Clone @old_ns and set the clone refcount to 1 * * Return: The new namespace or ERR_PTR. */ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, struct time_namespace *old_ns) { struct time_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; ucounts = inc_time_namespaces(user_ns); if (!ucounts) goto fail; err = -ENOMEM; ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; ns_tree_add(ns); return ns; fail_free_page: __free_page(ns->vvar_page); fail_free: kfree(ns); fail_dec: dec_time_namespaces(ucounts); fail: return ERR_PTR(err); } /** * copy_time_ns - Create timens_for_children from @old_ns * @flags: Cloning flags * @user_ns: User namespace which owns a new namespace. * @old_ns: Namespace to clone * * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; * adds a refcounter to @old_ns otherwise. * * Return: timens_for_children namespace or ERR_PTR. */ struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) return get_time_ns(old_ns); return clone_time_ns(user_ns, old_ns); } static struct timens_offset offset_from_ts(struct timespec64 off) { struct timens_offset ret; ret.sec = off.tv_sec; ret.nsec = off.tv_nsec; return ret; } /* * A time namespace VVAR page has the same layout as the VVAR page which * contains the system wide VDSO data. * * For a normal task the VVAR pages are installed in the normal ordering: * VVAR * PVCLOCK * HVCLOCK * TIMENS <- Not really required * * Now for a timens task the pages are installed in the following order: * TIMENS * PVCLOCK * HVCLOCK * VVAR * * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ static void timens_setup_vdso_clock_data(struct vdso_clock *vc, struct time_namespace *ns) { struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); vc->seq = 1; vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; offset[CLOCK_BOOTTIME] = boottime; offset[CLOCK_BOOTTIME_ALARM] = boottime; } struct page *find_timens_vvar_page(struct vm_area_struct *vma) { if (likely(vma->vm_mm == current->mm)) return current->nsproxy->time_ns->vvar_page; /* * VM_PFNMAP | VM_IO protect .fault() handler from being called * through interfaces like /proc/$pid/mem or * process_vm_{readv,writev}() as long as there's no .access() * in special_mapping_vmops(). * For more details check_vma_flags() and __access_remote_vm() */ WARN(1, "vvar_page accessed remotely"); return NULL; } /* * Protects possibly multiple offsets writers racing each other * and tasks entering the namespace. */ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { struct vdso_time_data *vdata; struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) return; /* Fast-path, taken by every task in namespace except the first. */ if (likely(ns->frozen_offsets)) return; mutex_lock(&offset_lock); /* Nothing to-do: vvar_page has been already initialized. */ if (ns->frozen_offsets) goto out; ns->frozen_offsets = true; vdata = page_address(ns->vvar_page); vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) timens_setup_vdso_clock_data(&vc[i], ns); if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); } out: mutex_unlock(&offset_lock); } void free_time_ns(struct time_namespace *ns) { ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->time_ns; get_time_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static struct ns_common *timens_for_children_get(struct task_struct *task) { struct time_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->time_ns_for_children; get_time_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void timens_put(struct ns_common *ns) { put_time_ns(to_time_ns(ns)); } void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { timens_set_vvar_page(tsk, ns); vdso_join_timens(tsk, ns); } static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); if (!current_is_single_threaded()) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; get_time_ns(ns); put_time_ns(nsproxy->time_ns_for_children); nsproxy->time_ns_for_children = ns; return 0; } void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) return; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; timens_commit(tsk, ns); } static struct user_namespace *timens_owner(struct ns_common *ns) { return to_time_ns(ns)->user_ns; } static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) { char *clock; switch (clockid) { case CLOCK_BOOTTIME: clock = "boottime"; break; case CLOCK_MONOTONIC: clock = "monotonic"; break; default: clock = "unknown"; break; } seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) { struct ns_common *ns; struct time_namespace *time_ns; ns = timens_for_children_get(p); if (!ns) return; time_ns = to_time_ns(ns); show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); put_time_ns(time_ns); } int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int noffsets) { struct ns_common *ns; struct time_namespace *time_ns; struct timespec64 tp; int i, err; ns = timens_for_children_get(p); if (!ns) return -ESRCH; time_ns = to_time_ns(ns); if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { put_time_ns(time_ns); return -EPERM; } for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; switch (off->clockid) { case CLOCK_MONOTONIC: ktime_get_ts64(&tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(&tp); break; default: err = -EINVAL; goto out; } err = -ERANGE; if (off->val.tv_sec > KTIME_SEC_MAX || off->val.tv_sec < -KTIME_SEC_MAX) goto out; tp = timespec64_add(tp, off->val); /* * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is * still unreachable. */ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) goto out; } mutex_lock(&offset_lock); if (time_ns->frozen_offsets) { err = -EACCES; goto out_unlock; } err = 0; /* Don't report errors after this line */ for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; struct timespec64 *offset = NULL; switch (off->clockid) { case CLOCK_MONOTONIC: offset = &time_ns->offsets.monotonic; break; case CLOCK_BOOTTIME: offset = &time_ns->offsets.boottime; break; } *offset = off->val; } out_unlock: mutex_unlock(&offset_lock); out: put_time_ns(time_ns); return err; } const struct proc_ns_operations timens_operations = { .name = "time", .get = timens_get, .put = timens_put, .install = timens_install, .owner = timens_owner, }; const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", .get = timens_for_children_get, .put = timens_put, .install = timens_install, .owner = timens_owner, }; struct time_namespace init_time_ns = { .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, .frozen_offsets = true, }; void __init time_ns_init(void) { ns_tree_add(&init_time_ns); }
6 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 // SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <net/checksum.h> #include <net/dst_cache.h> #include <net/ip.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/lwtunnel.h> #include <net/protocol.h> #include <uapi/linux/ila.h> #include "ila.h" struct ila_lwt { struct ila_params p; struct dst_cache dst_cache; u32 connected : 1; u32 lwt_output : 1; }; static inline struct ila_lwt *ila_lwt_lwtunnel( struct lwtunnel_state *lwt) { return (struct ila_lwt *)lwt->data; } static inline struct ila_params *ila_params_lwtunnel( struct lwtunnel_state *lwt) { return &ila_lwt_lwtunnel(lwt)->p; } static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct rt6_info *rt = dst_rt6_info(orig_dst); struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate); struct dst_entry *dst; int err = -EINVAL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; if (ilwt->lwt_output) ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate), true); if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) { /* Already have a next hop address in route, no need for * dest cache route. */ return orig_dst->lwtstate->orig_output(net, sk, skb); } local_bh_disable(); dst = dst_cache_get(&ilwt->dst_cache); local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6; /* Lookup a route for the new destination. Take into * account that the base route may already have a gateway. */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = dst_dev(orig_dst)->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = -EHOSTUNREACH; dst_release(dst); goto drop; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto drop; } /* cache only if we don't create a dst reference loop */ if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); local_bh_enable(); } } skb_dst_drop(skb); skb_dst_set(skb, dst); return dst_output(net, sk, skb); drop: kfree_skb(skb); return err; } static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate); if (skb->protocol != htons(ETH_P_IPV6)) goto drop; if (!ilwt->lwt_output) ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); return dst->lwtstate->orig_input(skb); drop: kfree_skb(skb); return -EINVAL; } static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct ila_lwt *ilwt; struct ila_params *p; struct nlattr *tb[ILA_ATTR_MAX + 1]; struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; u8 ident_type = ILA_ATYPE_USE_FORMAT; u8 hook_type = ILA_HOOK_ROUTE_OUTPUT; u8 csum_mode = ILA_CSUM_NO_ACTION; bool lwt_output = true; u8 eff_ident_type; int ret; if (family != AF_INET6) return -EINVAL; ret = nla_parse_nested_deprecated(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; if (!tb[ILA_ATTR_LOCATOR]) return -EINVAL; iaddr = (struct ila_addr *)&cfg6->fc_dst; if (tb[ILA_ATTR_IDENT_TYPE]) ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]); if (ident_type == ILA_ATYPE_USE_FORMAT) { /* Infer identifier type from type field in formatted * identifier. */ if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { /* Need to have full locator and at least type field * included in destination */ return -EINVAL; } eff_ident_type = iaddr->ident.type; } else { eff_ident_type = ident_type; } switch (eff_ident_type) { case ILA_ATYPE_IID: /* Don't allow ILA for IID type */ return -EINVAL; case ILA_ATYPE_LUID: break; case ILA_ATYPE_VIRT_V4: case ILA_ATYPE_VIRT_UNI_V6: case ILA_ATYPE_VIRT_MULTI_V6: case ILA_ATYPE_NONLOCAL_ADDR: /* These ILA formats are not supported yet. */ default: return -EINVAL; } if (tb[ILA_ATTR_HOOK_TYPE]) hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]); switch (hook_type) { case ILA_HOOK_ROUTE_OUTPUT: lwt_output = true; break; case ILA_HOOK_ROUTE_INPUT: lwt_output = false; break; default: return -EINVAL; } if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); if (csum_mode == ILA_CSUM_NEUTRAL_MAP && ila_csum_neutral_set(iaddr->ident)) { /* Don't allow translation if checksum neutral bit is * configured and it's set in the SIR address. */ return -EINVAL; } newts = lwtunnel_state_alloc(sizeof(*ilwt)); if (!newts) return -ENOMEM; ilwt = ila_lwt_lwtunnel(newts); ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC); if (ret) { kfree(newts); return ret; } ilwt->lwt_output = !!lwt_output; p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; p->ident_type = ident_type; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we * know both the old locator and the new one. */ p->locator_match = iaddr->loc; ila_init_saved_csum(p); newts->type = LWTUNNEL_ENCAP_ILA; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | LWTUNNEL_STATE_INPUT_REDIRECT; if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr)) ilwt->connected = 1; *ts = newts; return 0; } static void ila_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&ila_lwt_lwtunnel(lwt)->dst_cache); } static int ila_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ila_params *p = ila_params_lwtunnel(lwtstate); struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate); if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT : ILA_HOOK_ROUTE_INPUT)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */ 0; } static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct ila_params *a_p = ila_params_lwtunnel(a); struct ila_params *b_p = ila_params_lwtunnel(b); return (a_p->locator.v64 != b_p->locator.v64); } static const struct lwtunnel_encap_ops ila_encap_ops = { .build_state = ila_build_state, .destroy_state = ila_destroy_state, .output = ila_output, .input = ila_input, .fill_encap = ila_fill_encap_info, .get_encap_size = ila_encap_nlsize, .cmp_encap = ila_encap_cmp, .owner = THIS_MODULE, }; int ila_lwt_init(void) { return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); } void ila_lwt_fini(void) { lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); }
5862 166 11158 299 12286 572 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for non-atomic * bit operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #include <linux/instrumented.h> /** * ___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___set_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___set_bit(nr, addr); } /** * ___clear_bit - Clears a bit in memory * @nr: the bit to clear * @addr: the address to start counting from * * Unlike clear_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___clear_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit(nr, addr); } /** * ___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * * Unlike change_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___change_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___change_bit(nr, addr); } static __always_inline void __instrument_read_write_bitop(long nr, volatile unsigned long *addr) { if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) { /* * We treat non-atomic read-write bitops a little more special. * Given the operations here only modify a single bit, assuming * non-atomicity of the writer is sufficient may be reasonable * for certain usage (and follows the permissible nature of the * assume-plain-writes-atomic rule): * 1. report read-modify-write races -> check read; * 2. do not report races with marked readers, but do report * races with unmarked readers -> check "atomic" write. */ kcsan_check_read(addr + BIT_WORD(nr), sizeof(long)); /* * Use generic write instrumentation, in case other sanitizers * or tools are enabled alongside KCSAN. */ instrument_write(addr + BIT_WORD(nr), sizeof(long)); } else { instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); } } /** * ___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static __always_inline bool ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_set_bit(nr, addr); } /** * ___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static __always_inline bool ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_clear_bit(nr, addr); } /** * ___test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static __always_inline bool ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_change_bit(nr, addr); } /** * _test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool _test_bit(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit(nr, addr); } /** * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool _test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit_acquire(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
93 40 93 4 29 8 92 29 121 92 29 121 131 121 131 121 12 121 121 86 86 12 12 12 24 5 19 12 12 15 5 1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 // SPDX-License-Identifier: GPL-2.0-or-later /* Local endpoint object management * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/ip.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/af_rxrpc.h> #include "ar-internal.h" static void rxrpc_local_rcu(struct rcu_head *); /* * Handle an ICMP/ICMP6 error turning up at the tunnel. Push it through the * usual mechanism so that it gets parsed and presented through the UDP * socket's error_report(). */ static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { if (ip_hdr(skb)->version == IPVERSION) return ip_icmp_error(sk, skb, err, port, info, payload); if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6)) return ipv6_icmp_error(sk, skb, err, port, info, payload); } /* * Set or clear the Don't Fragment flag on a socket. */ void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set) { if (set) ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); else ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DONT); } /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. * * We explicitly don't compare the RxRPC service ID as we want to reject * conflicting uses by differing services. Further, we don't want to share * addresses with different options (IPv6), so we don't compare those bits * either. */ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { long diff; diff = ((local->srx.transport_type - srx->transport_type) ?: (local->srx.transport_len - srx->transport_len) ?: (local->srx.transport.family - srx->transport.family)); if (diff != 0) return diff; switch (srx->transport.family) { case AF_INET: /* If the choice of UDP port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin.sin_port - (u16 __force)srx->transport.sin.sin_port) ?: memcmp(&local->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: /* If the choice of UDP6 port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&local->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); #endif default: BUG(); } } static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_local *local = container_of(timer, struct rxrpc_local, client_conn_reap_timer); if (!local->kill_all_client_conns && test_and_set_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) rxrpc_wake_up_io_thread(local); } /* * Allocate a new local endpoint. */ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; u32 tmp; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { refcount_set(&local->ref, 1); atomic_set(&local->active_users, 1); local->net = net; local->rxnet = rxrpc_net(net); INIT_HLIST_NODE(&local->link); init_completion(&local->io_thread_ready); #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY skb_queue_head_init(&local->rx_delay_queue); #endif skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->conn_attend_q); INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); local->kill_all_client_conns = false; INIT_LIST_HEAD(&local->idle_client_conns); timer_setup(&local->client_conn_reap_timer, rxrpc_client_conn_reap_timeout, 0); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; idr_init(&local->conn_ids); get_random_bytes(&tmp, sizeof(tmp)); tmp &= 0x3fffffff; if (tmp == 0) tmp = 1; idr_set_cursor(&local->conn_ids, tmp); INIT_LIST_HEAD(&local->new_client_calls); spin_lock_init(&local->client_call_lock); trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1); } _leave(" = %p", local); return local; } /* * create the local socket * - must be called with rxrpc_local_mutex locked */ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct sockaddr_rxrpc *srx = &local->srx; struct udp_port_cfg udp_conf = {0}; struct task_struct *io_thread; struct sock *usk; int ret; _enter("%p{%d,%d}", local, srx->transport_type, srx->transport.family); udp_conf.family = srx->transport.family; udp_conf.use_udp_checksums = true; if (udp_conf.family == AF_INET) { udp_conf.local_ip = srx->transport.sin.sin_addr; udp_conf.local_udp_port = srx->transport.sin.sin_port; #if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) } else { udp_conf.local_ip6 = srx->transport.sin6.sin6_addr; udp_conf.local_udp_port = srx->transport.sin6.sin6_port; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; #endif } ret = udp_sock_create(net, &udp_conf, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; } tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_encap_rcv; tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); /* set the socket up */ usk = local->socket->sk; usk->sk_error_report = rxrpc_error_report; switch (srx->transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ ip6_sock_set_recverr(usk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. */ fallthrough; case AF_INET: /* we want to receive ICMP errors */ ip_sock_set_recverr(usk); /* we want to set the don't fragment bit */ rxrpc_local_dont_fragment(local, true); break; default: BUG(); } io_thread = kthread_run(rxrpc_io_thread, local, "krxrpcio/%u", ntohs(udp_conf.local_udp_port)); if (IS_ERR(io_thread)) { ret = PTR_ERR(io_thread); goto error_sock; } wait_for_completion(&local->io_thread_ready); WRITE_ONCE(local->io_thread, io_thread); _leave(" = 0"); return 0; error_sock: kernel_sock_shutdown(local->socket, SHUT_RDWR); local->socket->sk->sk_user_data = NULL; sock_release(local->socket); local->socket = NULL; return ret; } /* * Look up or create a new local endpoint using the specified local address. */ struct rxrpc_local *rxrpc_lookup_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; struct rxrpc_net *rxnet = rxrpc_net(net); struct hlist_node *cursor; long diff; int ret; _enter("{%d,%d,%pISp}", srx->transport_type, srx->transport.family, &srx->transport); mutex_lock(&rxnet->local_mutex); hlist_for_each(cursor, &rxnet->local_endpoints) { local = hlist_entry(cursor, struct rxrpc_local, link); diff = rxrpc_local_cmp_key(local, srx); if (diff != 0) continue; /* Services aren't allowed to share transport sockets, so * reject that here. It is possible that the object is dying - * but it may also still have the local transport address that * we want bound. */ if (srx->srx_service) { local = NULL; goto addr_in_use; } /* Found a match. We want to replace a dying object. * Attempting to bind the transport socket may still fail if * we're attempting to use a local address that the dying * object is still using. */ if (!rxrpc_use_local(local, rxrpc_local_use_lookup)) break; goto found; } local = rxrpc_alloc_local(net, srx); if (!local) goto nomem; ret = rxrpc_open_socket(local, net); if (ret < 0) goto sock_error; if (cursor) { hlist_replace_rcu(cursor, &local->link); cursor->pprev = NULL; } else { hlist_add_head_rcu(&local->link, &rxnet->local_endpoints); } found: mutex_unlock(&rxnet->local_mutex); _leave(" = %p", local); return local; nomem: ret = -ENOMEM; sock_error: mutex_unlock(&rxnet->local_mutex); if (local) call_rcu(&local->rcu, rxrpc_local_rcu); _leave(" = %d", ret); return ERR_PTR(ret); addr_in_use: mutex_unlock(&rxnet->local_mutex); _leave(" = -EADDRINUSE"); return ERR_PTR(-EADDRINUSE); } /* * Get a ref on a local endpoint. */ struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; u = atomic_read(&local->active_users); __refcount_inc(&local->ref, &r); trace_rxrpc_local(local->debug_id, why, r + 1, u); return local; } /* * Get a ref on a local endpoint unless its usage has already reached 0. */ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; if (local && __refcount_inc_not_zero(&local->ref, &r)) { u = atomic_read(&local->active_users); trace_rxrpc_local(local->debug_id, why, r + 1, u); return local; } return NULL; } /* * Drop a ref on a local endpoint. */ void rxrpc_put_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { unsigned int debug_id; bool dead; int r, u; if (local) { debug_id = local->debug_id; u = atomic_read(&local->active_users); dead = __refcount_dec_and_test(&local->ref, &r); trace_rxrpc_local(debug_id, why, r, u); if (dead) call_rcu(&local->rcu, rxrpc_local_rcu); } } /* * Start using a local endpoint. */ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { local = rxrpc_get_local_maybe(local, rxrpc_local_get_for_use); if (!local) return NULL; if (!__rxrpc_use_local(local, why)) { rxrpc_put_local(local, rxrpc_local_put_for_use); return NULL; } return local; } /* * Cease using a local endpoint. Once the number of active users reaches 0, we * start the closure of the transport in the I/O thread.. */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { unsigned int debug_id; int r, u; if (local) { debug_id = local->debug_id; r = refcount_read(&local->ref); u = atomic_dec_return(&local->active_users); trace_rxrpc_local(debug_id, why, r, u); if (u == 0) kthread_stop(local->io_thread); } } /* * Destroy a local endpoint's socket and then hand the record to RCU to dispose * of. * * Closing the socket cannot be done from bottom half context or RCU callback * context because it might sleep. */ void rxrpc_destroy_local(struct rxrpc_local *local) { struct socket *socket = local->socket; struct rxrpc_net *rxnet = local->rxnet; _enter("%d", local->debug_id); local->dead = true; mutex_lock(&rxnet->local_mutex); hlist_del_init_rcu(&local->link); mutex_unlock(&rxnet->local_mutex); rxrpc_clean_up_local_conns(local); rxrpc_service_connection_reaper(&rxnet->service_conn_reaper); ASSERT(!local->service); if (socket) { local->socket = NULL; kernel_sock_shutdown(socket, SHUT_RDWR); socket->sk->sk_user_data = NULL; sock_release(socket); } /* At this point, there should be no more packets coming in to the * local endpoint. */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY rxrpc_purge_queue(&local->rx_delay_queue); #endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); page_frag_cache_drain(&local->tx_alloc); } /* * Destroy a local endpoint after the RCU grace period expires. */ static void rxrpc_local_rcu(struct rcu_head *rcu) { struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu); rxrpc_see_local(local, rxrpc_local_free); kfree(local); } /* * Verify the local endpoint list is empty by this point. */ void rxrpc_destroy_all_locals(struct rxrpc_net *rxnet) { struct rxrpc_local *local; _enter(""); flush_workqueue(rxrpc_workqueue); if (!hlist_empty(&rxnet->local_endpoints)) { mutex_lock(&rxnet->local_mutex); hlist_for_each_entry(local, &rxnet->local_endpoints, link) { pr_err("AF_RXRPC: Leaked local %p {%d}\n", local, refcount_read(&local->ref)); } mutex_unlock(&rxnet->local_mutex); BUG(); } }
13 13 13 13 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 // SPDX-License-Identifier: GPL-2.0-or-later /* RSA asymmetric public-key algorithm [RFC3447] * * Copyright (c) 2015, Intel Corporation * Authors: Tadeusz Struk <tadeusz.struk@intel.com> */ #include <linux/fips.h> #include <linux/module.h> #include <linux/mpi.h> #include <crypto/internal/rsa.h> #include <crypto/internal/akcipher.h> #include <crypto/akcipher.h> #include <crypto/algapi.h> struct rsa_mpi_key { MPI n; MPI e; MPI d; MPI p; MPI q; MPI dp; MPI dq; MPI qinv; }; static int rsa_check_payload(MPI x, MPI n) { MPI n1; if (mpi_cmp_ui(x, 1) <= 0) return -EINVAL; n1 = mpi_alloc(0); if (!n1) return -ENOMEM; if (mpi_sub_ui(n1, n, 1) || mpi_cmp(x, n1) >= 0) { mpi_free(n1); return -EINVAL; } mpi_free(n1); return 0; } /* * RSAEP function [RFC3447 sec 5.1.1] * c = m^e mod n; */ static int _rsa_enc(const struct rsa_mpi_key *key, MPI c, MPI m) { /* * Even though (1) in RFC3447 only requires 0 <= m <= n - 1, we are * slightly more conservative and require 1 < m < n - 1. This is in line * with SP 800-56Br2, Section 7.1.1. */ if (rsa_check_payload(m, key->n)) return -EINVAL; /* (2) c = m^e mod n */ return mpi_powm(c, m, key->e, key->n); } /* * RSADP function [RFC3447 sec 5.1.2] * m_1 = c^dP mod p; * m_2 = c^dQ mod q; * h = (m_1 - m_2) * qInv mod p; * m = m_2 + q * h; */ static int _rsa_dec_crt(const struct rsa_mpi_key *key, MPI m_or_m1_or_h, MPI c) { MPI m2, m12_or_qh; int ret = -ENOMEM; /* * Even though (1) in RFC3447 only requires 0 <= c <= n - 1, we are * slightly more conservative and require 1 < c < n - 1. This is in line * with SP 800-56Br2, Section 7.1.2. */ if (rsa_check_payload(c, key->n)) return -EINVAL; m2 = mpi_alloc(0); m12_or_qh = mpi_alloc(0); if (!m2 || !m12_or_qh) goto err_free_mpi; /* (2i) m_1 = c^dP mod p */ ret = mpi_powm(m_or_m1_or_h, c, key->dp, key->p); if (ret) goto err_free_mpi; /* (2i) m_2 = c^dQ mod q */ ret = mpi_powm(m2, c, key->dq, key->q); if (ret) goto err_free_mpi; /* (2iii) h = (m_1 - m_2) * qInv mod p */ ret = mpi_sub(m12_or_qh, m_or_m1_or_h, m2) ?: mpi_mulm(m_or_m1_or_h, m12_or_qh, key->qinv, key->p); /* (2iv) m = m_2 + q * h */ ret = ret ?: mpi_mul(m12_or_qh, key->q, m_or_m1_or_h) ?: mpi_addm(m_or_m1_or_h, m2, m12_or_qh, key->n); err_free_mpi: mpi_free(m12_or_qh); mpi_free(m2); return ret; } static inline struct rsa_mpi_key *rsa_get_key(struct crypto_akcipher *tfm) { return akcipher_tfm_ctx(tfm); } static int rsa_enc(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); const struct rsa_mpi_key *pkey = rsa_get_key(tfm); MPI m, c = mpi_alloc(0); int ret = 0; int sign; if (!c) return -ENOMEM; if (unlikely(!pkey->n || !pkey->e)) { ret = -EINVAL; goto err_free_c; } ret = -ENOMEM; m = mpi_read_raw_from_sgl(req->src, req->src_len); if (!m) goto err_free_c; ret = _rsa_enc(pkey, c, m); if (ret) goto err_free_m; ret = mpi_write_to_sgl(c, req->dst, req->dst_len, &sign); if (ret) goto err_free_m; if (sign < 0) ret = -EBADMSG; err_free_m: mpi_free(m); err_free_c: mpi_free(c); return ret; } static int rsa_dec(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); const struct rsa_mpi_key *pkey = rsa_get_key(tfm); MPI c, m = mpi_alloc(0); int ret = 0; int sign; if (!m) return -ENOMEM; if (unlikely(!pkey->n || !pkey->d)) { ret = -EINVAL; goto err_free_m; } ret = -ENOMEM; c = mpi_read_raw_from_sgl(req->src, req->src_len); if (!c) goto err_free_m; ret = _rsa_dec_crt(pkey, m, c); if (ret) goto err_free_c; ret = mpi_write_to_sgl(m, req->dst, req->dst_len, &sign); if (ret) goto err_free_c; if (sign < 0) ret = -EBADMSG; err_free_c: mpi_free(c); err_free_m: mpi_free(m); return ret; } static void rsa_free_mpi_key(struct rsa_mpi_key *key) { mpi_free(key->d); mpi_free(key->e); mpi_free(key->n); mpi_free(key->p); mpi_free(key->q); mpi_free(key->dp); mpi_free(key->dq); mpi_free(key->qinv); key->d = NULL; key->e = NULL; key->n = NULL; key->p = NULL; key->q = NULL; key->dp = NULL; key->dq = NULL; key->qinv = NULL; } static int rsa_check_key_length(unsigned int len) { switch (len) { case 512: case 1024: case 1536: if (fips_enabled) return -EINVAL; fallthrough; case 2048: case 3072: case 4096: return 0; } return -EINVAL; } static int rsa_check_exponent_fips(MPI e) { MPI e_max = NULL; int err; /* check if odd */ if (!mpi_test_bit(e, 0)) { return -EINVAL; } /* check if 2^16 < e < 2^256. */ if (mpi_cmp_ui(e, 65536) <= 0) { return -EINVAL; } e_max = mpi_alloc(0); if (!e_max) return -ENOMEM; err = mpi_set_bit(e_max, 256); if (err) { mpi_free(e_max); return err; } if (mpi_cmp(e, e_max) >= 0) { mpi_free(e_max); return -EINVAL; } mpi_free(e_max); return 0; } static int rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { struct rsa_mpi_key *mpi_key = akcipher_tfm_ctx(tfm); struct rsa_key raw_key = {0}; int ret; /* Free the old MPI key if any */ rsa_free_mpi_key(mpi_key); ret = rsa_parse_pub_key(&raw_key, key, keylen); if (ret) return ret; mpi_key->e = mpi_read_raw_data(raw_key.e, raw_key.e_sz); if (!mpi_key->e) goto err; mpi_key->n = mpi_read_raw_data(raw_key.n, raw_key.n_sz); if (!mpi_key->n) goto err; if (rsa_check_key_length(mpi_get_size(mpi_key->n) << 3)) { rsa_free_mpi_key(mpi_key); return -EINVAL; } if (fips_enabled && rsa_check_exponent_fips(mpi_key->e)) { rsa_free_mpi_key(mpi_key); return -EINVAL; } return 0; err: rsa_free_mpi_key(mpi_key); return -ENOMEM; } static int rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { struct rsa_mpi_key *mpi_key = akcipher_tfm_ctx(tfm); struct rsa_key raw_key = {0}; int ret; /* Free the old MPI key if any */ rsa_free_mpi_key(mpi_key); ret = rsa_parse_priv_key(&raw_key, key, keylen); if (ret) return ret; mpi_key->d = mpi_read_raw_data(raw_key.d, raw_key.d_sz); if (!mpi_key->d) goto err; mpi_key->e = mpi_read_raw_data(raw_key.e, raw_key.e_sz); if (!mpi_key->e) goto err; mpi_key->n = mpi_read_raw_data(raw_key.n, raw_key.n_sz); if (!mpi_key->n) goto err; mpi_key->p = mpi_read_raw_data(raw_key.p, raw_key.p_sz); if (!mpi_key->p) goto err; mpi_key->q = mpi_read_raw_data(raw_key.q, raw_key.q_sz); if (!mpi_key->q) goto err; mpi_key->dp = mpi_read_raw_data(raw_key.dp, raw_key.dp_sz); if (!mpi_key->dp) goto err; mpi_key->dq = mpi_read_raw_data(raw_key.dq, raw_key.dq_sz); if (!mpi_key->dq) goto err; mpi_key->qinv = mpi_read_raw_data(raw_key.qinv, raw_key.qinv_sz); if (!mpi_key->qinv) goto err; if (rsa_check_key_length(mpi_get_size(mpi_key->n) << 3)) { rsa_free_mpi_key(mpi_key); return -EINVAL; } if (fips_enabled && rsa_check_exponent_fips(mpi_key->e)) { rsa_free_mpi_key(mpi_key); return -EINVAL; } return 0; err: rsa_free_mpi_key(mpi_key); return -ENOMEM; } static unsigned int rsa_max_size(struct crypto_akcipher *tfm) { struct rsa_mpi_key *pkey = akcipher_tfm_ctx(tfm); return mpi_get_size(pkey->n); } static void rsa_exit_tfm(struct crypto_akcipher *tfm) { struct rsa_mpi_key *pkey = akcipher_tfm_ctx(tfm); rsa_free_mpi_key(pkey); } static struct akcipher_alg rsa = { .encrypt = rsa_enc, .decrypt = rsa_dec, .set_priv_key = rsa_set_priv_key, .set_pub_key = rsa_set_pub_key, .max_size = rsa_max_size, .exit = rsa_exit_tfm, .base = { .cra_name = "rsa", .cra_driver_name = "rsa-generic", .cra_priority = 100, .cra_module = THIS_MODULE, .cra_ctxsize = sizeof(struct rsa_mpi_key), }, }; static int __init rsa_init(void) { int err; err = crypto_register_akcipher(&rsa); if (err) return err; err = crypto_register_template(&rsa_pkcs1pad_tmpl); if (err) goto err_unregister_rsa; err = crypto_register_template(&rsassa_pkcs1_tmpl); if (err) goto err_unregister_rsa_pkcs1pad; return 0; err_unregister_rsa_pkcs1pad: crypto_unregister_template(&rsa_pkcs1pad_tmpl); err_unregister_rsa: crypto_unregister_akcipher(&rsa); return err; } static void __exit rsa_exit(void) { crypto_unregister_template(&rsassa_pkcs1_tmpl); crypto_unregister_template(&rsa_pkcs1pad_tmpl); crypto_unregister_akcipher(&rsa); } module_init(rsa_init); module_exit(rsa_exit); MODULE_ALIAS_CRYPTO("rsa"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RSA generic algorithm");
16 16 16 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 // SPDX-License-Identifier: GPL-2.0 /* Builtin firmware support */ #include <linux/firmware.h> #include "../firmware.h" /* Only if FW_LOADER=y */ #ifdef CONFIG_FW_LOADER struct builtin_fw { char *name; void *data; unsigned long size; }; extern struct builtin_fw __start_builtin_fw[]; extern struct builtin_fw __end_builtin_fw[]; static bool fw_copy_to_prealloc_buf(struct firmware *fw, void *buf, size_t size) { if (!buf) return true; if (size < fw->size) return false; memcpy(buf, fw->data, fw->size); return true; } /** * firmware_request_builtin() - load builtin firmware * @fw: pointer to firmware struct * @name: name of firmware file * * Some use cases in the kernel have a requirement so that no memory allocator * is involved as these calls take place early in boot process. An example is * the x86 CPU microcode loader. In these cases all the caller wants is to see * if the firmware was built-in and if so use it right away. This can be used * for such cases. * * This looks for the firmware in the built-in kernel. Only if the kernel was * built-in with the firmware you are looking for will this return successfully. * * Callers of this API do not need to use release_firmware() as the pointer to * the firmware is expected to be provided locally on the stack of the caller. **/ bool firmware_request_builtin(struct firmware *fw, const char *name) { struct builtin_fw *b_fw; if (!fw) return false; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { if (strcmp(name, b_fw->name) == 0) { fw->size = b_fw->size; fw->data = b_fw->data; return true; } } return false; } EXPORT_SYMBOL_NS_GPL(firmware_request_builtin, "TEST_FIRMWARE"); /** * firmware_request_builtin_buf() - load builtin firmware into optional buffer * @fw: pointer to firmware struct * @name: name of firmware file * @buf: If set this lets you use a pre-allocated buffer so that the built-in * firmware into is copied into. This field can be NULL. It is used by * callers such as request_firmware_into_buf() and * request_partial_firmware_into_buf() * @size: if buf was provided, the max size of the allocated buffer available. * If the built-in firmware does not fit into the pre-allocated @buf this * call will fail. * * This looks for the firmware in the built-in kernel. Only if the kernel was * built-in with the firmware you are looking for will this call possibly * succeed. If you passed a @buf the firmware will be copied into it *iff* the * built-in firmware fits into the pre-allocated buffer size specified in * @size. * * This caller is to be used internally by the firmware_loader only. **/ bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { if (!firmware_request_builtin(fw, name)) return false; return fw_copy_to_prealloc_buf(fw, buf, size); } bool firmware_is_builtin(const struct firmware *fw) { struct builtin_fw *b_fw; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) if (fw->data == b_fw->data) return true; return false; } #endif
30 30 30 30 30 30 1 29 29 2 1 30 30 30 30 29 29 30 5 29 29 30 29 30 5 23 2 28 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 // SPDX-License-Identifier: GPL-2.0-or-later /* * Forwarding decision * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netpoll.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <linux/netfilter_bridge.h> #include "br_private.h" /* Don't forward packets to originating port or forwarding disabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { struct net_bridge_vlan_group *vg; vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && (br_mst_is_enabled(p) || p->state == BR_STATE_FORWARDING) && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_push(skb, ETH_HLEN); if (!is_skb_forwardable(skb->dev, skb)) goto drop; br_drop_fake_rtable(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && eth_type_vlan(skb->protocol)) { int depth; if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth)) goto drop; skb_set_network_header(skb, depth); } br_switchdev_frame_set_offload_fwd_mark(skb); dev_queue_xmit(skb); return 0; drop: kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_tstamp(skb); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); } EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_orig) { struct net_bridge_vlan_group *vg; struct net_device *indev; struct net *net; int br_hook; /* Mark the skb for forwarding offload early so that br_handle_vlan() * can know whether to pop the VLAN header on egress or keep it. */ nbp_switchdev_frame_mark_tx_fwd_offload(to, skb); vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, to, vg, skb); if (!skb) return; indev = skb->dev; skb->dev = to->dev; if (!local_orig) { if (skb_warn_if_lro(skb)) { kfree_skb(skb); return; } br_hook = NF_BR_FORWARD; skb_forward_csum(skb); net = dev_net(indev); } else { if (unlikely(netpoll_tx_running(to->br->dev))) { skb_push(skb, ETH_HLEN); if (!is_skb_forwardable(skb->dev, skb)) kfree_skb(skb); else br_netpoll_send_skb(to, skb); return; } br_hook = NF_BR_LOCAL_OUT; net = dev_net(skb->dev); indev = NULL; } NF_HOOK(NFPROTO_BRIDGE, br_hook, net, NULL, skb, indev, skb->dev, br_forward_finish); } static int deliver_clone(const struct net_bridge_port *prev, struct sk_buff *skb, bool local_orig) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) { DEV_STATS_INC(dev, tx_dropped); return -ENOMEM; } __br_forward(prev, skb, local_orig); return 0; } /** * br_forward - forward a packet to a specific port * @to: destination port * @skb: packet being forwarded * @local_rcv: packet will be received locally after forwarding * @local_orig: packet is locally originated * * Should be called with rcu_read_lock. */ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_rcv, bool local_orig) { if (unlikely(!to)) goto out; /* redirect to backup link if the destination port is down */ if (rcu_access_pointer(to->backup_port) && (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) { struct net_bridge_port *backup_port; backup_port = rcu_dereference(to->backup_port); if (unlikely(!backup_port)) goto out; BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid); to = backup_port; } if (should_deliver(to, skb)) { if (local_rcv) deliver_clone(to, skb, local_orig); else __br_forward(to, skb, local_orig); return; } out: if (!local_rcv) kfree_skb(skb); } EXPORT_SYMBOL_GPL(br_forward); static struct net_bridge_port *maybe_deliver( struct net_bridge_port *prev, struct net_bridge_port *p, struct sk_buff *skb, bool local_orig) { u8 igmp_type = br_multicast_igmp_type(skb); int err; if (!should_deliver(p, skb)) return prev; nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb); if (!prev) goto out; err = deliver_clone(prev, skb, local_orig); if (err) return ERR_PTR(err); out: br_multicast_count(p->br, p, skb, igmp_type, BR_MCAST_DIR_TX); return p; } /* called under rcu_read_lock */ void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig, u16 vid) { enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET; struct net_bridge_port *prev = NULL; struct net_bridge_port *p; br_tc_skb_miss_set(skb, pkt_type != BR_PKT_BROADCAST); list_for_each_entry_rcu(p, &br->port_list, list) { /* Do not flood unicast traffic to ports that turn it off, nor * other traffic if flood off, except for traffic we originate */ switch (pkt_type) { case BR_PKT_UNICAST: if (!(p->flags & BR_FLOOD)) continue; break; case BR_PKT_MULTICAST: if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) continue; break; case BR_PKT_BROADCAST: if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev) continue; break; } /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; if (BR_INPUT_SKB_CB(skb)->proxyarp_replied && ((p->flags & BR_PROXYARP_WIFI) || br_is_neigh_suppress_enabled(p, vid))) continue; prev = maybe_deliver(prev, p, skb, local_orig); if (IS_ERR(prev)) { reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM : SKB_DROP_REASON_NOT_SPECIFIED; goto out; } } if (!prev) goto out; if (local_rcv) deliver_clone(prev, skb, local_orig); else __br_forward(prev, skb, local_orig); return; out: if (!local_rcv) kfree_skb_reason(skb, reason); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, const unsigned char *addr, bool local_orig) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; const unsigned char *src = eth_hdr(skb)->h_source; struct sk_buff *nskb; if (!should_deliver(p, skb)) return; /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */ if (skb->dev == p->dev && ether_addr_equal(src, addr)) return; __skb_push(skb, ETH_HLEN); nskb = pskb_copy(skb, GFP_ATOMIC); __skb_pull(skb, ETH_HLEN); if (!nskb) { DEV_STATS_INC(dev, tx_dropped); return; } skb = nskb; __skb_pull(skb, ETH_HLEN); if (!is_broadcast_ether_addr(addr)) memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN); __br_forward(p, skb, local_orig); } /* called with rcu_read_lock */ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET; struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; bool allow_mode_include = true; struct hlist_node *rp; rp = br_multicast_get_first_rport_node(brmctx, skb); if (mdst) { p = rcu_dereference(mdst->ports); if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) && br_multicast_is_star_g(&mdst->addr)) allow_mode_include = false; } else { p = NULL; br_tc_skb_miss_set(skb, true); } while (p || rp) { struct net_bridge_port *port, *lport, *rport; lport = p ? p->key.port : NULL; rport = br_multicast_rport_from_node_skb(rp, skb); if ((unsigned long)lport > (unsigned long)rport) { port = lport; if (port->flags & BR_MULTICAST_TO_UNICAST) { maybe_deliver_addr(lport, skb, p->eth_addr, local_orig); goto delivered; } if ((!allow_mode_include && p->filter_mode == MCAST_INCLUDE) || (p->flags & MDB_PG_FLAGS_BLOCKED)) goto delivered; } else { port = rport; } prev = maybe_deliver(prev, port, skb, local_orig); if (IS_ERR(prev)) { reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM : SKB_DROP_REASON_NOT_SPECIFIED; goto out; } delivered: if ((unsigned long)lport >= (unsigned long)port) p = rcu_dereference(p->next); if ((unsigned long)rport >= (unsigned long)port) rp = rcu_dereference(hlist_next_rcu(rp)); } if (!prev) goto out; if (local_rcv) deliver_clone(prev, skb, local_orig); else __br_forward(prev, skb, local_orig); return; out: if (!local_rcv) kfree_skb_reason(skb, reason); } #endif
1760 1760 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_H #define _ASM_X86_PAGE_H #include <linux/types.h> #ifdef __KERNEL__ #include <asm/page_types.h> #ifdef CONFIG_X86_64 #include <asm/page_64.h> #else #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLER__ struct page; #include <linux/range.h> extern struct range pfn_mapped[]; extern int nr_pfn_mapped; static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); } static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { copy_page(to, from); } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) #endif #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ /* * We need __phys_reloc_hide() here because gcc may assume that there is no * overflow during __pa() calculation and can optimize it unexpectedly. * Newer versions of gcc provide -fno-strict-overflow switch to handle this * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) /* * virt_to_page(kaddr) returns a valid pointer if and only if * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) static __always_inline void *pfn_to_kaddr(unsigned long pfn) { return __va(pfn << PAGE_SHIFT); } static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) { return __canonical_address(vaddr, vaddr_bits) == vaddr; } #endif /* __ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */
15 8 16 22 20 1 1238 1236 1232 12 2 22 22 6 21 21 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static void siw_destroy_cpulist(int number) { int i = 0; while (i < number) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; } static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; siw_destroy_cpulist(i); return -ENOMEM; } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * All other events are not handled */ default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
45 13 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Hash algorithms. * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_HASH_H #define _CRYPTO_INTERNAL_HASH_H #include <crypto/algapi.h> #include <crypto/hash.h> /* Set this bit to handle partial blocks in the API. */ #define CRYPTO_AHASH_ALG_BLOCK_ONLY 0x01000000 /* Set this bit if final requires at least one byte. */ #define CRYPTO_AHASH_ALG_FINAL_NONZERO 0x02000000 /* Set this bit if finup can deal with multiple blocks. */ #define CRYPTO_AHASH_ALG_FINUP_MAX 0x04000000 /* This bit is set by the Crypto API if export_core is not supported. */ #define CRYPTO_AHASH_ALG_NO_EXPORT_CORE 0x08000000 #define HASH_FBREQ_ON_STACK(name, req) \ char __##name##_req[sizeof(struct ahash_request) + \ MAX_SYNC_HASH_REQSIZE] CRYPTO_MINALIGN_ATTR; \ struct ahash_request *name = ahash_fbreq_on_stack_init( \ __##name##_req, (req)) struct ahash_request; struct scatterlist; struct crypto_hash_walk { const char *data; unsigned int offset; unsigned int flags; struct page *pg; unsigned int entrylen; unsigned int total; struct scatterlist *sg; }; struct ahash_instance { void (*free)(struct ahash_instance *inst); union { struct { char head[offsetof(struct ahash_alg, halg.base)]; struct crypto_instance base; } s; struct ahash_alg alg; }; }; struct shash_instance { void (*free)(struct shash_instance *inst); union { struct { char head[offsetof(struct shash_alg, base)]; struct crypto_instance base; } s; struct shash_alg alg; }; }; struct crypto_ahash_spawn { struct crypto_spawn base; }; struct crypto_shash_spawn { struct crypto_spawn base; }; int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk); static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk) { return !(walk->entrylen | walk->total); } int crypto_register_ahash(struct ahash_alg *alg); void crypto_unregister_ahash(struct ahash_alg *alg); int crypto_register_ahashes(struct ahash_alg *algs, int count); void crypto_unregister_ahashes(struct ahash_alg *algs, int count); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); void ahash_free_singlespawn_instance(struct ahash_instance *inst); int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); static inline bool crypto_shash_alg_has_setkey(struct shash_alg *alg) { return alg->setkey != shash_no_setkey; } bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg); static inline bool crypto_shash_alg_needs_key(struct shash_alg *alg) { return crypto_shash_alg_has_setkey(alg) && !(alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY); } static inline bool crypto_hash_alg_needs_key(struct hash_alg_common *alg) { return crypto_hash_alg_has_setkey(alg) && !(alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY); } static inline bool crypto_hash_no_export_core(struct crypto_ahash *tfm) { return crypto_hash_alg_common(tfm)->base.cra_flags & CRYPTO_AHASH_ALG_NO_EXPORT_CORE; } int crypto_grab_ahash(struct crypto_ahash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_ahash(struct crypto_ahash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct hash_alg_common *crypto_spawn_ahash_alg( struct crypto_ahash_spawn *spawn) { return __crypto_hash_alg_common(spawn->base.alg); } int crypto_register_shash(struct shash_alg *alg); void crypto_unregister_shash(struct shash_alg *alg); int crypto_register_shashes(struct shash_alg *algs, int count); void crypto_unregister_shashes(struct shash_alg *algs, int count); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst); void shash_free_singlespawn_instance(struct shash_instance *inst); int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct shash_alg *crypto_spawn_shash_alg( struct crypto_shash_spawn *spawn) { return __crypto_shash_alg(spawn->base.alg); } int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc); static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) { return crypto_tfm_ctx(crypto_ahash_tfm(tfm)); } static inline void *crypto_ahash_ctx_dma(struct crypto_ahash *tfm) { return crypto_tfm_ctx_dma(crypto_ahash_tfm(tfm)); } static inline struct ahash_alg *__crypto_ahash_alg(struct crypto_alg *alg) { return container_of(__crypto_hash_alg_common(alg), struct ahash_alg, halg); } static inline struct ahash_alg *crypto_ahash_alg(struct crypto_ahash *hash) { return container_of(crypto_hash_alg_common(hash), struct ahash_alg, halg); } static inline void crypto_ahash_set_statesize(struct crypto_ahash *tfm, unsigned int size) { tfm->statesize = size; } static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm, unsigned int reqsize) { tfm->reqsize = reqsize; } static inline bool crypto_ahash_tested(struct crypto_ahash *tfm) { struct crypto_tfm *tfm_base = crypto_ahash_tfm(tfm); return tfm_base->__crt_alg->cra_flags & CRYPTO_ALG_TESTED; } static inline void crypto_ahash_set_reqsize_dma(struct crypto_ahash *ahash, unsigned int reqsize) { reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); ahash->reqsize = reqsize; } static inline struct crypto_instance *ahash_crypto_instance( struct ahash_instance *inst) { return &inst->s.base; } static inline struct ahash_instance *ahash_instance( struct crypto_instance *inst) { return container_of(inst, struct ahash_instance, s.base); } static inline struct ahash_instance *ahash_alg_instance( struct crypto_ahash *ahash) { return ahash_instance(crypto_tfm_alg_instance(&ahash->base)); } static inline void *ahash_instance_ctx(struct ahash_instance *inst) { return crypto_instance_ctx(ahash_crypto_instance(inst)); } static inline void *ahash_request_ctx_dma(struct ahash_request *req) { unsigned int align = crypto_dma_align(); if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(ahash_request_ctx(req), align); } static inline void ahash_request_complete(struct ahash_request *req, int err) { crypto_request_complete(&req->base, err); } static inline u32 ahash_request_flags(struct ahash_request *req) { return crypto_request_flags(&req->base) & ~CRYPTO_AHASH_REQ_PRIVATE; } static inline struct crypto_ahash *crypto_spawn_ahash( struct crypto_ahash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline int ahash_enqueue_request(struct crypto_queue *queue, struct ahash_request *request) { return crypto_enqueue_request(queue, &request->base); } static inline struct ahash_request *ahash_dequeue_request( struct crypto_queue *queue) { return ahash_request_cast(crypto_dequeue_request(queue)); } static inline void *crypto_shash_ctx(struct crypto_shash *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline struct crypto_instance *shash_crypto_instance( struct shash_instance *inst) { return &inst->s.base; } static inline struct shash_instance *shash_instance( struct crypto_instance *inst) { return container_of(inst, struct shash_instance, s.base); } static inline struct shash_instance *shash_alg_instance( struct crypto_shash *shash) { return shash_instance(crypto_tfm_alg_instance(&shash->base)); } static inline void *shash_instance_ctx(struct shash_instance *inst) { return crypto_instance_ctx(shash_crypto_instance(inst)); } static inline struct crypto_shash *crypto_spawn_shash( struct crypto_shash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_shash, base); } static inline bool ahash_request_isvirt(struct ahash_request *req) { return req->base.flags & CRYPTO_AHASH_REQ_VIRT; } static inline bool crypto_ahash_req_virt(struct crypto_ahash *tfm) { return crypto_tfm_req_virt(&tfm->base); } static inline struct crypto_ahash *crypto_ahash_fb(struct crypto_ahash *tfm) { return __crypto_ahash_cast(crypto_ahash_tfm(tfm)->fb); } static inline struct ahash_request *ahash_fbreq_on_stack_init( char *buf, struct ahash_request *old) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(old); struct ahash_request *req = (void *)buf; crypto_stack_request_init(&req->base, crypto_ahash_tfm(crypto_ahash_fb(tfm))); ahash_request_set_callback(req, ahash_request_flags(old), NULL, NULL); req->base.flags &= ~CRYPTO_AHASH_REQ_PRIVATE; req->base.flags |= old->base.flags & CRYPTO_AHASH_REQ_PRIVATE; req->src = old->src; req->result = old->result; req->nbytes = old->nbytes; return req; } /* Return the state size without partial block for block-only algorithms. */ static inline unsigned int crypto_shash_coresize(struct crypto_shash *tfm) { return crypto_shash_statesize(tfm) - crypto_shash_blocksize(tfm) - 1; } /* This can only be used if the request was never cloned. */ #define HASH_REQUEST_ZERO(name) \ memzero_explicit(__##name##_req, sizeof(__##name##_req)) /** * crypto_ahash_export_core() - extract core state for message digest * @req: reference to the ahash_request handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * Export the hash state without the partial block buffer. * * Context: Softirq or process context. * Return: 0 if the export creation was successful; < 0 if an error occurred */ int crypto_ahash_export_core(struct ahash_request *req, void *out); /** * crypto_ahash_import_core() - import core state * @req: reference to ahash_request handle the state is imported into * @in: buffer holding the state * * Import the hash state without the partial block buffer. * * Context: Softirq or process context. * Return: 0 if the import was successful; < 0 if an error occurred */ int crypto_ahash_import_core(struct ahash_request *req, const void *in); /** * crypto_shash_export_core() - extract core state for message digest * @desc: reference to the operational state handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * Export the hash state without the partial block buffer. * * Context: Softirq or process context. * Return: 0 if the export creation was successful; < 0 if an error occurred */ int crypto_shash_export_core(struct shash_desc *desc, void *out); /** * crypto_shash_import_core() - import core state * @desc: reference to the operational state handle the state imported into * @in: buffer holding the state * * Import the hash state without the partial block buffer. * * Context: Softirq or process context. * Return: 0 if the import was successful; < 0 if an error occurred */ int crypto_shash_import_core(struct shash_desc *desc, const void *in); #endif /* _CRYPTO_INTERNAL_HASH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vsyscall #if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __VSYSCALL_TRACE_H #include <linux/tracepoint.h> TRACE_EVENT(emulate_vsyscall, TP_PROTO(int nr), TP_ARGS(nr), TP_STRUCT__entry(__field(int, nr)), TP_fast_assign( __entry->nr = nr; ), TP_printk("nr = %d", __entry->nr) ); #endif #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ #define TRACE_INCLUDE_FILE vsyscall_trace #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 /* SPDX-License-Identifier: GPL-2.0-only */ /* L2TP internal definitions. * * Copyright (c) 2008,2009 Katalix Systems Ltd */ #include <linux/refcount.h> #ifndef _L2TP_CORE_H_ #define _L2TP_CORE_H_ #include <net/dst.h> #include <net/sock.h> #ifdef CONFIG_XFRM #include <net/xfrm.h> #endif /* Random numbers used for internal consistency checks of tunnel and session structures */ #define L2TP_SESSION_MAGIC 0x0C04EB7D struct sk_buff; struct l2tp_stats { atomic_long_t tx_packets; atomic_long_t tx_bytes; atomic_long_t tx_errors; atomic_long_t rx_packets; atomic_long_t rx_bytes; atomic_long_t rx_seq_discards; atomic_long_t rx_oos_packets; atomic_long_t rx_errors; atomic_long_t rx_cookie_discards; atomic_long_t rx_invalid; }; struct l2tp_tunnel; /* L2TP session configuration */ struct l2tp_session_cfg { enum l2tp_pwtype pw_type; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ u8 peer_cookie[8]; /* peer's cookie */ int peer_cookie_len; /* 0, 4 or 8 bytes */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ char *ifname; }; struct l2tp_session_coll_list { spinlock_t lock; /* for access to list */ struct list_head list; refcount_t ref_count; }; /* Represents a session (pseudowire) instance. * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics. * Is linked into a per-tunnel session list and a per-net ("global") IDR tree. */ #define L2TP_SESSION_NAME_MAX 32 struct l2tp_session { int magic; /* should be L2TP_SESSION_MAGIC */ long dead; struct rcu_head rcu; struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */ u32 session_id; u32 peer_session_id; u8 cookie[8]; int cookie_len; u8 peer_cookie[8]; int peer_cookie_len; u16 l2specific_type; u16 hdr_len; u32 nr; /* session NR state (receive) */ u32 ns; /* session NR state (send) */ struct sk_buff_head reorder_q; /* receive reorder queue */ u32 nr_max; /* max NR. Depends on tunnel */ u32 nr_window_size; /* NR window size */ u32 nr_oos; /* NR of last OOS packet */ int nr_oos_count; /* for OOS recovery */ int nr_oos_count_max; struct list_head list; /* per-tunnel list node */ refcount_t ref_count; struct hlist_node hlist; /* per-net session hlist */ unsigned long hlist_key; /* key for session hlist */ struct l2tp_session_coll_list *coll_list; /* session collision list */ struct list_head clist; /* for coll_list */ char name[L2TP_SESSION_NAME_MAX]; /* for logging */ char ifname[IFNAMSIZ]; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; struct work_struct del_work; /* Session receive handler for data packets. * Each pseudowire implementation should implement this callback in order to * handle incoming packets. Packets are passed to the pseudowire handler after * reordering, if data sequence numbers are enabled for the session. */ void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); /* Session close handler. * Each pseudowire implementation may implement this callback in order to carry * out pseudowire-specific shutdown actions. * The callback is called by core after unlisting the session and purging its * reorder queue. */ void (*session_close)(struct l2tp_session *session); /* Session show handler. * Pseudowire-specific implementation of debugfs session rendering. * The callback is called by l2tp_debugfs.c after rendering core session * information. */ void (*show)(struct seq_file *m, void *priv); u8 priv[]; /* private data */ }; /* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ struct in_addr local_ip; struct in_addr peer_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr *local_ip6; struct in6_addr *peer_ip6; #endif u16 local_udp_port; u16 peer_udp_port; unsigned int use_udp_checksums:1, udp6_zero_tx_checksums:1, udp6_zero_rx_checksums:1; }; /* Represents a tunnel instance. * Tracks runtime state including IO statistics. * Holds the tunnel socket (either passed from userspace or directly created by the kernel). * Maintains a list of sessions belonging to the tunnel instance. * Is linked into a per-net list of tunnels. */ #define L2TP_TUNNEL_NAME_MAX 20 struct l2tp_tunnel { unsigned long dead; struct rcu_head rcu; spinlock_t list_lock; /* write-protection for session_list */ bool acpt_newsess; /* indicates whether this tunnel accepts * new sessions. Protected by list_lock. */ struct list_head session_list; /* list of sessions */ u32 tunnel_id; u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */ enum l2tp_encap_type encap; struct l2tp_stats stats; struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; struct sock *sock; /* parent socket */ int fd; /* parent fd, if tunnel socket was created * by userspace */ struct work_struct del_work; }; /* Pseudowire ops callbacks for use with the l2tp genetlink interface */ struct l2tp_nl_cmd_ops { /* The pseudowire session create callback is responsible for creating a session * instance for a specific pseudowire type. * It must call l2tp_session_create and l2tp_session_register to register the * session instance, as well as carry out any pseudowire-specific initialisation. * It must return >= 0 on success, or an appropriate negative errno value on failure. */ int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); /* The pseudowire session delete callback is responsible for initiating the deletion * of a session instance. * It must call l2tp_session_delete, as well as carry out any pseudowire-specific * teardown actions. */ void (*session_delete)(struct l2tp_session *session); }; static inline void *l2tp_session_priv(struct l2tp_session *session) { return &session->priv[0]; } /* Tunnel and session refcounts */ void l2tp_tunnel_put(struct l2tp_tunnel *tunnel); void l2tp_session_put(struct l2tp_session *session); /* Tunnel and session lookup. * These functions take a reference on the instances they return, so * the caller must ensure that the reference is dropped appropriately. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_next(const struct net *net, unsigned long *key); struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id); struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id); struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver, u32 tunnel_id, u32 session_id); struct l2tp_session *l2tp_session_get_next(const struct net *net, struct sock *sk, int pver, u32 tunnel_id, unsigned long *key); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); /* Tunnel and session lifetime management. * Creation of a new instance is a two-step process: create, then register. * Destruction is triggered using the *_delete functions, and completes asynchronously. */ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel); void l2tp_session_delete(struct l2tp_session *session); /* Receive path helpers. If data sequencing is enabled for the session these * functions handle queuing and reordering prior to passing packets to the * pseudowire code to be passed to userspace. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); /* Transmit path helpers for sending packets over the tunnel socket. */ void l2tp_session_set_header_len(struct l2tp_session *session, int version, enum l2tp_encap_type encap); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb); /* Pseudowire management. * Pseudowires should register with l2tp core on module init, and unregister * on module exit. */ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ int l2tp_ioctl(struct sock *sk, int cmd, int *karg); struct l2tp_tunnel *l2tp_sk_to_tunnel(const struct sock *sk); static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { switch (session->l2specific_type) { case L2TP_L2SPECTYPE_DEFAULT: return 4; case L2TP_L2SPECTYPE_NONE: default: return 0; } } static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel) { struct dst_entry *dst; u32 mtu; dst = sk_dst_get(tunnel->sock); if (!dst) return 0; mtu = dst_mtu(dst); dst_release(dst); return mtu; } #ifdef CONFIG_XFRM static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; return sk && (rcu_access_pointer(sk->sk_policy[0]) || rcu_access_pointer(sk->sk_policy[1])); } #else static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { return false; } #endif static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, struct sk_buff *skb, unsigned char **ptr, unsigned char **optr) { int opt_len = session->peer_cookie_len + l2tp_get_l2specific_len(session); if (opt_len > 0) { int off = *ptr - *optr; if (!pskb_may_pull(skb, off + opt_len)) return -1; if (skb->data != *optr) { *optr = skb->data; *ptr = skb->data + off; } } return 0; } #define MODULE_ALIAS_L2TP_PWTYPE(type) \ MODULE_ALIAS("net-l2tp-type-" __stringify(type)) #endif /* _L2TP_CORE_H_ */
29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. It is mainly used for debugging and * statistics. * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de> * * Fixes: * Alan Cox : UDP sockets show the rxqueue/txqueue * using hint flag for the netinfo. * Pauline Middelink : identd support * Alan Cox : Make /proc safer. * Erik Schoenfelder : /proc/net/snmp * Alan Cox : Handle dead sockets properly. * Gerhard Koerting : Show both timers * Alan Cox : Allow inode to be NULL (kernel socket) * Andi Kleen : Add support for open_requests and * split functions for more readibility. * Andi Kleen : Add support for /proc/net/netstat * Arnaldo C. Melo : Convert to seq_file */ #include <linux/types.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #include <net/sock.h> #include <net/raw.h> #define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX) /* * Report socket allocation statistics [mea@utu.fi] */ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; int orphans, sockets; orphans = tcp_orphan_count_sum(); sockets = proto_sockets_allocated_sum_positive(&tcp_prot); socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1, sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), proto_memory_allocated(&udp_prot)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", atomic_read(&net->ipv4.fqdir->rhashtable.nelems), frag_mem_limit(net->ipv4.fqdir)); return 0; } /* snmp items */ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS), SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS), SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS), SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS), }; /* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS), SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS), SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS), SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS), SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* Non RFC4293 fields */ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), }; static const struct { const char *name; int index; } icmpmibmap[] = { { "DestUnreachs", ICMP_DEST_UNREACH }, { "TimeExcds", ICMP_TIME_EXCEEDED }, { "ParmProbs", ICMP_PARAMETERPROB }, { "SrcQuenchs", ICMP_SOURCE_QUENCH }, { "Redirects", ICMP_REDIRECT }, { "Echos", ICMP_ECHO }, { "EchoReps", ICMP_ECHOREPLY }, { "Timestamps", ICMP_TIMESTAMP }, { "TimestampReps", ICMP_TIMESTAMPREPLY }, { "AddrMasks", ICMP_ADDRESS }, { "AddrMaskReps", ICMP_ADDRESSREPLY }, { NULL, 0 } }; static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN), SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS), SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS), SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS), SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS), SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB), SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS), SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS), SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), }; static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), }; static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS), SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED), SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED), SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED), SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS), SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS), SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER), SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW), SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO), SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT), SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO), SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE), SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV), SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS), SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ), SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED), SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD), SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS), }; static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, unsigned short *type, int count) { int j; if (count) { seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } } static void icmpmsg_put(struct seq_file *seq) { #define PERLINE 16 int i, count; unsigned short type[PERLINE]; unsigned long vals[PERLINE], val; struct net *net = seq->private; count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); if (val) { type[count] = i; vals[count++] = val; } if (count == PERLINE) { icmpmsg_put_line(seq, vals, type, count); count = 0; } } icmpmsg_put_line(seq, vals, type, count); #undef PERLINE } static void icmp_put(struct seq_file *seq) { int i; struct net *net = seq->private; atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } /* * Called from the PROCfs module. This outputs /proc/net/snmp. */ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { const int cnt = ARRAY_SIZE(snmp4_ipstats_list); u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)]; struct net *net = seq->private; int i; memset(buff64, 0, sizeof(buff64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); for (i = 0; i < cnt; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2, READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < cnt; i++) seq_printf(seq, " %llu", buff64[i]); return 0; } static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { const int udp_cnt = ARRAY_SIZE(snmp4_udp_list); const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list); unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, tcp_cnt * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list, tcp_cnt, net->mib.tcp_statistics); for (i = 0; i < tcp_cnt; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); else seq_printf(seq, " %lu", buff[i]); } memset(buff, 0, udp_cnt * sizeof(unsigned long)); snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, udp_cnt, net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); memset(buff, 0, udp_cnt * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, udp_cnt, net->mib.udplite_statistics); for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); return 0; } static int snmp_seq_show(struct seq_file *seq, void *v) { snmp_seq_show_ipstats(seq, v); icmp_put(seq); /* RFC 2011 compatibility */ icmpmsg_put(seq); snmp_seq_show_tcp_udp(seq, v); return 0; } /* * Output /proc/net/netstat */ static int netstat_seq_show(struct seq_file *seq, void *v) { const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list); const int tcp_cnt = ARRAY_SIZE(snmp4_net_list); struct net *net = seq->private; unsigned long *buff; int i; seq_puts(seq, "TcpExt:"); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt, net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); } seq_puts(seq, "\nIpExt:"); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); if (buff) { u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", buff64[i]); } else { for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); } kfree(buff); seq_putc(seq, '\n'); mptcp_seq_show(seq); return 0; } static __net_init int ip_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat", 0444, net->proc_net, sockstat_seq_show, NULL)) goto out_sockstat; if (!proc_create_net_single("netstat", 0444, net->proc_net, netstat_seq_show, NULL)) goto out_netstat; if (!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show, NULL)) goto out_snmp; return 0; out_snmp: remove_proc_entry("netstat", net->proc_net); out_netstat: remove_proc_entry("sockstat", net->proc_net); out_sockstat: return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { remove_proc_entry("snmp", net->proc_net); remove_proc_entry("netstat", net->proc_net); remove_proc_entry("sockstat", net->proc_net); } static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, }; int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); }
8 631 630 575 631 630 563 561 494 494 494 494 493 494 494 493 494 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RMAP_H #define _LINUX_RMAP_H /* * Declarations for Reverse Mapping functions in mm/rmap.c */ #include <linux/list.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/rwsem.h> #include <linux/memcontrol.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/bit_spinlock.h> /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: * the vmas on the list will be related by forking, or by splitting. * * Since vmas come and go as they are split and merged (particularly * in mprotect), the mapping field of an anonymous page cannot point * directly to a vma: instead it points to an anon_vma, on whose list * the related vmas can be easily linked or unlinked. * * After unlinking the last vma on the list, we must garbage collect * the anon_vma object itself: we're guaranteed no page can be * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { struct anon_vma *root; /* Root of this anon_vma tree */ struct rw_semaphore rwsem; /* W: modification, R: walking the list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for * the duration of the operation. A caller that takes * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ atomic_t refcount; /* * Count of child anon_vmas. Equals to the count of all anon_vmas that * have ->parent pointing to this one, including itself. * * This counter is used for making decision about reusing anon_vma * instead of forking new one. See comments in function anon_vma_clone. */ unsigned long num_children; /* Count of VMAs whose ->anon_vma pointer points to this object. */ unsigned long num_active_vmas; struct anon_vma *parent; /* Parent of this anon_vma */ /* * NOTE: the LSB of the rb_root.rb_node is set by * mm_take_all_locks() _after_ taking the above lock. So the * rb_root must only be read/written after taking the above lock * to be sure to see a valid next pointer. The LSB bit itself * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). */ /* Interval tree of private "related" vmas */ struct rb_root_cached rb_root; }; /* * The copy-on-write semantics of fork mean that an anon_vma * can become associated with multiple processes. Furthermore, * each child process will have its own anon_vma, where new * pages for that process are instantiated. * * This structure allows us to find the anon_vmas associated * with a VMA, or the VMAs associated with an anon_vma. * The "same_vma" list contains the anon_vma_chains linking * all the anon_vmas associated with this VMA. * The "rb" field indexes on an interval tree the anon_vma_chains * which link all the VMAs associated with this anon_vma. */ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_lock & page_table_lock */ struct rb_node rb; /* locked by anon_vma->rwsem */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; #endif }; enum ttu_flags { TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ TTU_HWPOISON = 0x20, /* do convert pte to hwpoison entry */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ }; #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { atomic_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { if (atomic_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); } static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) { return down_write_trylock(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); } static inline void anon_vma_lock_read(struct anon_vma *anon_vma) { down_read(&anon_vma->root->rwsem); } static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) { return down_read_trylock(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); } /* * anon_vma helper functions. */ void anon_vma_init(void); /* create anon_vma_cachep */ int __anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); static inline int anon_vma_prepare(struct vm_area_struct *vma) { if (likely(vma->anon_vma)) return 0; return __anon_vma_prepare(vma); } static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); unlink_anon_vmas(next); } struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID static __always_inline void folio_lock_large_mapcount(struct folio *folio) { bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); } static __always_inline void folio_unlock_large_mapcount(struct folio *folio) { __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); } static inline unsigned int folio_mm_id(const struct folio *folio, int idx) { VM_WARN_ON_ONCE(idx != 0 && idx != 1); return folio->_mm_id[idx] & MM_ID_MASK; } static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id) { VM_WARN_ON_ONCE(idx != 0 && idx != 1); folio->_mm_id[idx] &= ~MM_ID_MASK; folio->_mm_id[idx] |= id; } static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio, int diff, mm_id_t mm_id) { VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio)); VM_WARN_ON_ONCE(diff <= 0); VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX); /* * Make sure we can detect at least one complete PTE mapping of the * folio in a single MM as "exclusively mapped". This is primarily * a check on 32bit, where we currently reduce the size of the per-MM * mapcount to a short. */ VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio)); VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY && folio->_mm_id_mapcount[0] != -1); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY && folio->_mm_id_mapcount[0] < 0); VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY && folio->_mm_id_mapcount[1] != -1); VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && folio->_mm_id_mapcount[1] < 0); VM_WARN_ON_ONCE(!folio_mapped(folio) && test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)); } static __always_inline void folio_set_large_mapcount(struct folio *folio, int mapcount, struct vm_area_struct *vma) { __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY); VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY); /* Note: mapcounts start at -1. */ atomic_set(&folio->_large_mapcount, mapcount - 1); folio->_mm_id_mapcount[0] = mapcount - 1; folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); } static __always_inline int folio_add_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; int new_mapcount_val; folio_lock_large_mapcount(folio); __folio_large_mapcount_sanity_checks(folio, diff, mm_id); new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff; atomic_set(&folio->_large_mapcount, new_mapcount_val); /* * If a folio is mapped more than once into an MM on 32bit, we * can in theory overflow the per-MM mapcount (although only for * fairly large folios), turning it negative. In that case, just * free up the slot and mark the folio "mapped shared", otherwise * we might be in trouble when unmapping pages later. */ if (folio_mm_id(folio, 0) == mm_id) { folio->_mm_id_mapcount[0] += diff; if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) { folio->_mm_id_mapcount[0] = -1; folio_set_mm_id(folio, 0, MM_ID_DUMMY); folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } } else if (folio_mm_id(folio, 1) == mm_id) { folio->_mm_id_mapcount[1] += diff; if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) { folio->_mm_id_mapcount[1] = -1; folio_set_mm_id(folio, 1, MM_ID_DUMMY); folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) { folio_set_mm_id(folio, 0, mm_id); folio->_mm_id_mapcount[0] = diff - 1; /* We might have other mappings already. */ if (new_mapcount_val != diff - 1) folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) { folio_set_mm_id(folio, 1, mm_id); folio->_mm_id_mapcount[1] = diff - 1; /* Slot 0 certainly has mappings as well. */ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } folio_unlock_large_mapcount(folio); return new_mapcount_val + 1; } #define folio_add_large_mapcount folio_add_return_large_mapcount static __always_inline int folio_sub_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; int new_mapcount_val; folio_lock_large_mapcount(folio); __folio_large_mapcount_sanity_checks(folio, diff, mm_id); new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff; atomic_set(&folio->_large_mapcount, new_mapcount_val); /* * There are valid corner cases where we might underflow a per-MM * mapcount (some mappings added when no slot was free, some mappings * added once a slot was free), so we always set it to -1 once we go * negative. */ if (folio_mm_id(folio, 0) == mm_id) { folio->_mm_id_mapcount[0] -= diff; if (folio->_mm_id_mapcount[0] >= 0) goto out; folio->_mm_id_mapcount[0] = -1; folio_set_mm_id(folio, 0, MM_ID_DUMMY); } else if (folio_mm_id(folio, 1) == mm_id) { folio->_mm_id_mapcount[1] -= diff; if (folio->_mm_id_mapcount[1] >= 0) goto out; folio->_mm_id_mapcount[1] = -1; folio_set_mm_id(folio, 1, MM_ID_DUMMY); } /* * If one MM slot owns all mappings, the folio is mapped exclusively. * Note that if the folio is now unmapped (new_mapcount_val == -1), both * slots must be free (mapcount == -1), and we'll also mark it as * exclusive. */ if (folio->_mm_id_mapcount[0] == new_mapcount_val || folio->_mm_id_mapcount[1] == new_mapcount_val) folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; out: folio_unlock_large_mapcount(folio); return new_mapcount_val + 1; } #define folio_sub_large_mapcount folio_sub_return_large_mapcount #else /* !CONFIG_MM_ID */ /* * See __folio_rmap_sanity_checks(), we might map large folios even without * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now. */ static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, struct vm_area_struct *vma) { /* Note: mapcounts start at -1. */ atomic_set(&folio->_large_mapcount, mapcount - 1); } static inline void folio_add_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { atomic_add(diff, &folio->_large_mapcount); } static inline int folio_add_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { BUILD_BUG(); } static inline void folio_sub_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { atomic_sub(diff, &folio->_large_mapcount); } static inline int folio_sub_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { BUILD_BUG(); } #endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) #define folio_inc_return_large_mapcount(folio, vma) \ folio_add_return_large_mapcount(folio, 1, vma) #define folio_dec_large_mapcount(folio, vma) \ folio_sub_large_mapcount(folio, 1, vma) #define folio_dec_return_large_mapcount(folio, vma) \ folio_sub_return_large_mapcount(folio, 1, vma) /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; /* * No special request: A mapped anonymous (sub)page is possibly shared between * processes. */ #define RMAP_NONE ((__force rmap_t)0) /* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio, const struct page *page, int nr_pages, enum pgtable_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); /* When (un)mapping zeropages, we should never touch ref+mapcount. */ VM_WARN_ON_FOLIO(is_zero_folio(folio), folio); /* * TODO: we get driver-allocated folios that have nothing to do with * the rmap using vm_insert_page(); therefore, we cannot assume that * folio_test_large_rmappable() holds for large folios. We should * handle any desired mapcount+stats accounting for these folios in * VM_MIXEDMAP VMAs separately, and then sanity-check here that * we really only get rmappable folios. */ VM_WARN_ON_ONCE(nr_pages <= 0); VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); switch (level) { case PGTABLE_LEVEL_PTE: break; case PGTABLE_LEVEL_PMD: /* * We don't support folios larger than a single PMD yet. So * when PGTABLE_LEVEL_PMD is set, we assume that we are creating * a single "entire" mapping of the folio. */ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; case PGTABLE_LEVEL_PUD: /* * Assume that we are creating a single "entire" mapping of the * folio. */ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); break; default: BUILD_BUG(); } /* * Anon folios must have an associated live anon_vma as long as they're * mapped into userspace. * Note that the atomic_read() mainly does two things: * * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to * check that the associated anon_vma has not yet been freed (subject * to KASAN's usual limitations). This check will pass if the * anon_vma's refcount has already dropped to 0 but an RCU grace * period hasn't passed since then. * 2. If the anon_vma has not yet been freed, it checks that the * anon_vma still has a nonzero refcount (as opposed to being in the * middle of an RCU delay for getting freed). */ if (folio_test_anon(folio) && !folio_test_ksm(folio)) { unsigned long mapping = (unsigned long)folio->mapping; struct anon_vma *anon_vma; anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON); VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio); } } /* * rmap interfaces called when adding or removing pte of page */ void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *, unsigned long address, rmap_t flags); #define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \ folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); void folio_add_file_rmap_pud(struct folio *, struct page *, struct vm_area_struct *); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ folio_remove_rmap_ptes(folio, page, 1, vma) void folio_remove_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); void folio_remove_rmap_pud(struct folio *, struct page *, struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); /* See folio_try_dup_anon_rmap_*() */ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); if (PageAnonExclusive(&folio->page)) { if (unlikely(folio_needs_cow_for_dma(vma, folio))) return -EBUSY; ClearPageAnonExclusive(&folio->page); } atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); return 0; } /* See folio_try_share_anon_rmap_*() */ static inline int hugetlb_try_share_anon_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio); /* Paired with the memory barrier in try_grab_folio(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb(); if (unlikely(folio_maybe_dma_pinned(folio))) return -EBUSY; ClearPageAnonExclusive(&folio->page); /* * This is conceptually a smp_wmb() paired with the smp_rmb() in * gup_must_unshare(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb__after_atomic(); return 0; } static inline void hugetlb_add_file_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); } static inline void hugetlb_remove_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); atomic_dec(&folio->_entire_mapcount); atomic_dec(&folio->_large_mapcount); } static __always_inline void __folio_dup_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { atomic_inc(&folio->_mapcount); break; } if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { do { atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; default: BUILD_BUG(); } } /** * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE); } /** * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif } static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); __folio_rmap_sanity_checks(folio, page, nr_pages, level); /* * If this folio may have been pinned by the parent process, * don't allow to duplicate the mappings but instead require to e.g., * copy the subpage immediately for the child so that we'll always * guarantee the pinned folio won't be randomly replaced in the * future on write faults. */ maybe_pinned = likely(!folio_is_device_private(folio)) && unlikely(folio_needs_cow_for_dma(src_vma, folio)); /* * No need to check+clear for already shared PTEs/PMDs of the * folio. But if any page is PageAnonExclusive, we must fallback to * copying if the folio maybe pinned. */ switch (level) { case PGTABLE_LEVEL_PTE: if (unlikely(maybe_pinned)) { for (i = 0; i < nr_pages; i++) if (PageAnonExclusive(page + i)) return -EBUSY; } if (!folio_test_large(folio)) { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); atomic_inc(&folio->_mapcount); break; } do { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; ClearPageAnonExclusive(page); } atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; default: BUILD_BUG(); } return 0; } /** * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range * of a folio * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated * @dst_vma: The destination vm area * @src_vma: The vm area from which the mappings are duplicated * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock and the * vma->vma_mm->write_protect_seq. * * Duplicating the mappings can only fail if the folio may be pinned; device * private folios cannot get pinned and consequently this function cannot fail * for them. * * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in * the parent and the child. They must *not* be writable after this call * succeeded. * * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, src_vma, PGTABLE_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, PGTABLE_LEVEL_PTE); } /** * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range * of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of * @dst_vma: The destination vm area * @src_vma: The vm area from which the mapping is duplicated * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock and the * vma->vma_mm->write_protect_seq. * * Duplicating the mapping can only fail if the folio may be pinned; device * private folios cannot get pinned and consequently this function cannot fail * for them. * * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in * the parent and the child. They must *not* be writable after this call * succeeded. * * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, src_vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; #endif } static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, struct page *page, int nr_pages, enum pgtable_level level) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); __folio_rmap_sanity_checks(folio, page, nr_pages, level); /* device private folios cannot get pinned via GUP. */ if (unlikely(folio_is_device_private(folio))) { ClearPageAnonExclusive(page); return 0; } /* * We have to make sure that when we clear PageAnonExclusive, that * the page is not pinned and that concurrent GUP-fast won't succeed in * concurrently pinning the page. * * Conceptually, PageAnonExclusive clearing consists of: * (A1) Clear PTE * (A2) Check if the page is pinned; back off if so. * (A3) Clear PageAnonExclusive * (A4) Restore PTE (optional, but certainly not writable) * * When clearing PageAnonExclusive, we cannot possibly map the page * writable again, because anon pages that may be shared must never * be writable. So in any case, if the PTE was writable it cannot * be writable anymore afterwards and there would be a PTE change. Only * if the PTE wasn't writable, there might not be a PTE change. * * Conceptually, GUP-fast pinning of an anon page consists of: * (B1) Read the PTE * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so. * (B3) Pin the mapped page * (B4) Check if the PTE changed by re-reading it; back off if so. * (B5) If the original PTE is not writable, check if * PageAnonExclusive is not set; back off if so. * * If the PTE was writable, we only have to make sure that GUP-fast * observes a PTE change and properly backs off. * * If the PTE was not writable, we have to make sure that GUP-fast either * detects a (temporary) PTE change or that PageAnonExclusive is cleared * and properly backs off. * * Consequently, when clearing PageAnonExclusive(), we have to make * sure that (A1), (A2)/(A3) and (A4) happen in the right memory * order. In GUP-fast pinning code, we have to make sure that (B3),(B4) * and (B5) happen in the right memory order. * * We assume that there might not be a memory barrier after * clearing/invalidating the PTE (A1) and before restoring the PTE (A4), * so we use explicit ones here. */ /* Paired with the memory barrier in try_grab_folio(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb(); if (unlikely(folio_maybe_dma_pinned(folio))) return -EBUSY; ClearPageAnonExclusive(page); /* * This is conceptually a smp_wmb() paired with the smp_rmb() in * gup_must_unshare(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb__after_atomic(); return 0; } /** * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page * mapped by a PTE possibly shared to prepare * for KSM or temporary unmapping * @folio: The folio to share a mapping of * @page: The mapped exclusive page * * The caller needs to hold the page table lock and has to have the page table * entries cleared/invalidated. * * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during * fork() to duplicate mappings, but instead to prepare for KSM or temporarily * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte(). * * Marking the mapped page shared can only fail if the folio maybe pinned; * device private folios cannot get pinned and consequently this function cannot * fail. * * Returns 0 if marking the mapped page possibly shared succeeded. Returns * -EBUSY otherwise. */ static inline int folio_try_share_anon_rmap_pte(struct folio *folio, struct page *page) { return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE); } /** * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page * range mapped by a PMD possibly shared to * prepare for temporary unmapping * @folio: The folio to share the mapping of * @page: The first page to share the mapping of * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock and has to have the page table * entries cleared/invalidated. * * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during * fork() to duplicate a mapping, but instead to prepare for temporarily * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd(). * * Marking the mapped pages shared can only fail if the folio maybe pinned; * device private folios cannot get pinned and consequently this function cannot * fail. * * Returns 0 if marking the mapped pages possibly shared succeeded. Returns * -EBUSY otherwise. */ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; #endif } /* * Called from mm/vmscan.c to handle paging out */ int folio_referenced(struct folio *, int is_locked, struct mem_cgroup *memcg, vm_flags_t *vm_flags); void try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, void *owner, struct folio **foliop); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) /* Result flags */ /* The page is mapped across page table boundary */ #define PVMW_PGTABLE_CROSSED (1 << 16) struct page_vma_mapped_walk { unsigned long pfn; unsigned long nr_pages; pgoff_t pgoff; struct vm_area_struct *vma; unsigned long address; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; unsigned int flags; }; #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = folio_pfn(_folio), \ .nr_pages = folio_nr_pages(_folio), \ .pgoff = folio_pgoff(_folio), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ } static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); } /** * page_vma_mapped_walk_restart - Restart the page table walk. * @pvmw: Pointer to struct page_vma_mapped_walk. * * It restarts the page table walk when changes occur in the page * table, such as splitting a PMD. Ensures that the PTL held during * the previous walk is released and resets the state to allow for * a new walk starting at the current address stored in pvmw->address. */ static inline void page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) { WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte); if (likely(pvmw->ptl)) spin_unlock(pvmw->ptl); else WARN_ON_ONCE(1); pvmw->ptl = NULL; pvmw->pmd = NULL; pvmw->pte = NULL; } bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); unsigned long page_address_in_vma(const struct folio *folio, const struct page *, const struct vm_area_struct *); /* * Cleans the PTEs of shared mappings. * (and since clean PTEs should also be readonly, write protects them too) * * returns the number of cleaned PTEs. */ int folio_mkclean(struct folio *); int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, unsigned long pfn, unsigned long nr_pages); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); enum rmp_flags { RMP_LOCKED = 1 << 0, RMP_USE_SHARED_ZEROPAGE = 1 << 1, }; void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() * try_lock: bail out if the rmap lock is contended * contended: indicate the rmap traversal bailed out due to lock contention * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ struct rmap_walk_control { void *arg; bool try_lock; bool contended; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. */ bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); struct anon_vma *(*anon_lock)(const struct folio *folio, struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) #define anon_vma_prepare(vma) (0) static inline int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, vm_flags_t *vm_flags) { *vm_flags = 0; return 0; } static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags) { } static inline int folio_mkclean(struct folio *folio) { return 0; } #endif /* CONFIG_MMU */ #endif /* _LINUX_RMAP_H */
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 /* SPDX-License-Identifier: GPL-2.0 */ /* interrupt.h */ #ifndef _LINUX_INTERRUPT_H #define _LINUX_INTERRUPT_H #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cleanup.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/hrtimer.h> #include <linux/kref.h> #include <linux/cpumask_types.h> #include <linux/workqueue.h> #include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/sections.h> /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When * requesting an interrupt without specifying a IRQF_TRIGGER, the * setting should be assumed to be "as already configured", which * may be as per machine or firmware initialisation. */ #define IRQF_TRIGGER_NONE 0x00000000 #define IRQF_TRIGGER_RISING 0x00000001 #define IRQF_TRIGGER_FALLING 0x00000002 #define IRQF_TRIGGER_HIGH 0x00000004 #define IRQF_TRIGGER_LOW 0x00000008 #define IRQF_TRIGGER_MASK (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \ IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING) #define IRQF_TRIGGER_PROBE 0x00000010 /* * These flags used only by the kernel as part of the * irq handling routines. * * IRQF_SHARED - allow sharing the irq among several devices * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur * IRQF_TIMER - Flag to mark this interrupt as timer interrupt * IRQF_PERCPU - Interrupt is per cpu * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is * registered first in a shared interrupt is considered for * performance reasons) * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee * that this interrupt will wake the system from a suspended * state. See Documentation/power/suspend-and-interrupts.rst * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this * interrupt handler after suspending interrupts. For system * wakeup devices users need to implement wakeup detection in * their interrupt handlers. * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it. * Users will enable it explicitly by enable_irq() or enable_nmi() * later. * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers, * depends on IRQF_PERCPU. * IRQF_COND_ONESHOT - Agree to do IRQF_ONESHOT if already set for a shared * interrupt. */ #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 #define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 #define IRQF_COND_SUSPEND 0x00040000 #define IRQF_NO_AUTOEN 0x00080000 #define IRQF_NO_DEBUG 0x00100000 #define IRQF_COND_ONESHOT 0x00200000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) /* * These values can be returned by request_any_context_irq() and * describe the context the interrupt will be run in. * * IRQC_IS_HARDIRQ - interrupt runs in hardirq context * IRQC_IS_NESTED - interrupt runs in a nested threaded context */ enum { IRQC_IS_HARDIRQ = 0, IRQC_IS_NESTED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); /** * struct irqaction - per interrupt action descriptor * @handler: interrupt handler function * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device * @affinity: CPUs this irqaction is allowed to run on * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) * @thread_fn: interrupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @secondary: pointer to secondary irqaction (force threading) * @thread_flags: flags related to @thread * @thread_mask: bitmask for keeping track of @thread activity * @dir: pointer to the proc/irq/NN/name entry */ struct irqaction { irq_handler_t handler; union { void *dev_id; void __percpu *percpu_dev_id; }; const struct cpumask *affinity; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; struct irqaction *secondary; unsigned int irq; unsigned int flags; unsigned long thread_flags; unsigned long thread_mask; const char *name; struct proc_dir_entry *dir; } ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); /* * If a (PCI) device interrupt is not connected we set dev->irq to * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we * can distinguish that case from other error returns. * * 0x80000000 is guaranteed to be outside the available range of interrupts * and easy to distinguish from other possible incorrect values. */ #define IRQ_NOTCONNECTED (1U << 31) extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long flags, const char *name, void *dev); /** * request_irq - Add a handler for an interrupt line * @irq: The interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts * If NULL, the default primary handler is installed * @flags: Handling flags * @name: Name of the device generating this interrupt * @dev: A cookie passed to the handler function * * This call allocates an interrupt and establishes a handler; see * the documentation for request_threaded_irq() for details. */ static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) { return request_threaded_irq(irq, handler, NULL, flags | IRQF_COND_ONESHOT, name, dev); } extern int __must_check request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id); extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, const cpumask_t *affinity, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev); static inline int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, devname, NULL, percpu_dev_id); } static inline int __must_check request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler, const char *devname, const cpumask_t *affinity, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, devname, affinity, percpu_dev_id); } extern int __must_check request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, const struct cpumask *affinity, void __percpu *dev_id); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); extern const void *free_nmi(unsigned int irq, void *dev_id); extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id); struct device; extern int __must_check devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id); static inline int __must_check devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, devname, dev_id); } extern int __must_check devm_request_any_context_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); extern bool irq_percpu_is_enabled(unsigned int irq); extern void irq_wake_thread(unsigned int irq, void *dev_id); DEFINE_LOCK_GUARD_1(disable_irq, int, disable_irq(*_T->lock), enable_irq(*_T->lock)) extern void disable_nmi_nosync(unsigned int irq); extern void disable_percpu_nmi(unsigned int irq); extern void enable_nmi(unsigned int irq); extern void enable_percpu_nmi(unsigned int irq, unsigned int type); extern int prepare_percpu_nmi(unsigned int irq); extern void teardown_percpu_nmi(unsigned int irq); extern int irq_inject_interrupt(unsigned int irq); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); extern void rearm_wake_irq(unsigned int irq); /** * struct irq_affinity_notify - context for notification of IRQ affinity changes * @irq: Interrupt to which notification applies * @kref: Reference count, for internal use * @work: Work item, for internal use * @notify: Function to be called on change. This will be * called in process context. * @release: Function to be called on release. This will be * called in process context. Once registered, the * structure must only be freed when this function is * called or later. */ struct irq_affinity_notify { unsigned int irq; struct kref kref; struct work_struct work; void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); void (*release)(struct kref *ref); }; #define IRQ_AFFINITY_MAX_SETS 4 /** * struct irq_affinity - Description for automatic irq affinity assignments * @pre_vectors: Don't apply affinity to @pre_vectors at beginning of * the MSI(-X) vector space * @post_vectors: Don't apply affinity to @post_vectors at end of * the MSI(-X) vector space * @nr_sets: The number of interrupt sets for which affinity * spreading is required * @set_size: Array holding the size of each interrupt set * @calc_sets: Callback for calculating the number and size * of interrupt sets * @priv: Private data for usage by @calc_sets, usually a * pointer to driver/device specific data. */ struct irq_affinity { unsigned int pre_vectors; unsigned int post_vectors; unsigned int nr_sets; unsigned int set_size[IRQ_AFFINITY_MAX_SETS]; void (*calc_sets)(struct irq_affinity *, unsigned int nvecs); void *priv; }; /** * struct irq_affinity_desc - Interrupt affinity descriptor * @mask: cpumask to hold the affinity assignment * @is_managed: 1 if the interrupt is managed internally */ struct irq_affinity_desc { struct cpumask mask; unsigned int is_managed : 1; }; #if defined(CONFIG_SMP) extern cpumask_var_t irq_default_affinity; extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity); /** * irq_update_affinity_hint - Update the affinity hint * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint, but does not change the affinity of the interrupt. */ static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, false); } /** * irq_set_affinity_and_hint - Update the affinity hint and apply the provided * cpumask to the interrupt * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint and if @m is not NULL it applies it as the * affinity of that interrupt. */ static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, true); } /* * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint() * instead. */ static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return irq_set_affinity_and_hint(irq, m); } extern int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd); unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return 0; } static inline int irq_can_set_affinity(unsigned int irq) { return 0; } static inline int irq_select_affinity(unsigned int irq) { return 0; } static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { return -EINVAL; } static inline int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { return 0; } static inline struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd) { return NULL; } static inline unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd) { return maxvec; } #endif /* CONFIG_SMP */ /* * Special lockdep variants of irq disabling/enabling. * These should be used for locking constructs that * know that a particular irq context which is disabled, * and which is the only irq-context user of a lock, * that it's safe to take the lock in the irq-disabled * section without disabling hardirqs. * * On !CONFIG_LOCKDEP they are equivalent to the normal * irq disable/enable methods. */ static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_disable(); #endif } static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_save(*flags); #endif } static inline void enable_irq_lockdep(unsigned int irq) { #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_enable(); #endif enable_irq(irq); } static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_restore(*flags); #endif enable_irq(irq); } /* IRQ wakeup (PM) control: */ extern int irq_set_irq_wake(unsigned int irq, unsigned int on); static inline int enable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 1); } static inline int disable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 0); } /* * irq_get_irqchip_state/irq_set_irqchip_state specific flags */ enum irqchip_irq_state { IRQCHIP_STATE_PENDING, /* Is interrupt pending? */ IRQCHIP_STATE_ACTIVE, /* Is interrupt in progress? */ IRQCHIP_STATE_MASKED, /* Is interrupt masked? */ IRQCHIP_STATE_LINE_LEVEL, /* Is IRQ line high? */ }; extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state); extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool state); #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) # endif #else #define force_irqthreads() (false) #endif #ifndef local_softirq_pending #ifndef local_softirq_pending_ref #define local_softirq_pending_ref irq_stat.__softirq_pending #endif #define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref)) #define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x))) #define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x))) #endif /* local_softirq_pending */ /* Some architectures might implement lazy enabling/disabling of * interrupts. In some cases, such as stop_machine, we might want * to ensure that after a local_irq_disable(), interrupts have * really been disabled in hardware. Such architectures need to * implement the following hook. */ #ifndef hard_irq_disable #define hard_irq_disable() do { } while(0) #endif /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high frequency threaded job scheduling. For almost all the purposes tasklets are more than enough. F.e. all serial device BHs et al. should be converted to tasklets, not to softirqs. */ enum { HI_SOFTIRQ=0, TIMER_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; /* * The following vectors can be safely ignored after ksoftirqd is parked: * * _ RCU: * 1) rcutree_migrate_callbacks() migrates the queue. * 2) rcutree_report_cpu_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue * * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue */ #define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\ BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ)) /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. */ extern const char * const softirq_to_name[NR_SOFTIRQS]; /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. KAO */ struct softirq_action { void (*action)(void); }; asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); #ifdef CONFIG_PREEMPT_RT extern void do_softirq_post_smp_call_flush(unsigned int was_pending); #else static inline void do_softirq_post_smp_call_flush(unsigned int unused) { do_softirq(); } #endif extern void open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); /* * With forced-threaded interrupts enabled a raised softirq is deferred to * ksoftirqd unless it can be handled within the threaded interrupt. This * affects timer_list timers and hrtimers which are explicitly marked with * HRTIMER_MODE_SOFT. * With PREEMPT_RT enabled more hrtimers are moved to softirq for processing * which includes all timers which are not explicitly marked HRTIMER_MODE_HARD. * Userspace controlled timers (like the clock_nanosleep() interface) is divided * into two categories: Tasks with elevated scheduling policy including * SCHED_{FIFO|RR|DL} and the remaining scheduling policy. The tasks with the * elevated scheduling policy are woken up directly from the HARDIRQ while all * other wake ups are delayed to softirq and so to ksoftirqd. * * The ksoftirqd runs at SCHED_OTHER policy at which it should remain since it * handles the softirq in an overloaded situation (not handled everything * within its last run). * If the timers are handled at SCHED_OTHER priority then they competes with all * other SCHED_OTHER tasks for CPU resources are possibly delayed. * Moving timers softirqs to a low priority SCHED_FIFO thread instead ensures * that timer are performed before scheduling any SCHED_OTHER thread. */ DECLARE_PER_CPU(struct task_struct *, ktimerd); DECLARE_PER_CPU(unsigned long, pending_timer_softirq); void raise_ktimers_thread(unsigned int nr); static inline unsigned int local_timers_pending_force_th(void) { return __this_cpu_read(pending_timer_softirq); } static inline void raise_timer_softirq(unsigned int nr) { lockdep_assert_in_irq(); if (force_irqthreads()) raise_ktimers_thread(nr); else __raise_softirq_irqoff(nr); } static inline unsigned int local_timers_pending(void) { if (force_irqthreads()) return local_timers_pending_force_th(); else return local_softirq_pending(); } DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) { return this_cpu_read(ksoftirqd); } /* Tasklets --- multithreaded analogue of BHs. This API is deprecated. Please consider using threaded IRQs instead: https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de Main feature differing them of generic softirqs: tasklet is running only on one CPU simultaneously. Main feature differing them of BHs: different tasklets may be run simultaneously on different CPUs. Properties: * If tasklet_schedule() is called, then tasklet is guaranteed to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. * If this tasklet is already running on another CPU (or schedule is called from tasklet itself), it is rescheduled for later. * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. */ struct tasklet_struct { struct tasklet_struct *next; unsigned long state; atomic_t count; bool use_callback; union { void (*func)(unsigned long data); void (*callback)(struct tasklet_struct *t); }; unsigned long data; }; #define DECLARE_TASKLET(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .callback = _callback, \ .use_callback = true, \ } #define DECLARE_TASKLET_DISABLED(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .callback = _callback, \ .use_callback = true, \ } #define from_tasklet(var, callback_tasklet, tasklet_fieldname) \ container_of(callback_tasklet, typeof(*var), tasklet_fieldname) #define DECLARE_TASKLET_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .func = _func, \ } #define DECLARE_TASKLET_DISABLED_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .func = _func, \ } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ }; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } void tasklet_unlock(struct tasklet_struct *t); void tasklet_unlock_wait(struct tasklet_struct *t); void tasklet_unlock_spin_wait(struct tasklet_struct *t); #else static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } static inline void tasklet_unlock(struct tasklet_struct *t) { } static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } #endif extern void __tasklet_schedule(struct tasklet_struct *t); static inline void tasklet_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_schedule(t); } extern void __tasklet_hi_schedule(struct tasklet_struct *t); static inline void tasklet_hi_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_hi_schedule(t); } static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); smp_mb__after_atomic(); } /* * Do not use in new code. Disabling tasklets from atomic contexts is * error prone and should be avoided. */ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_spin_wait(t); smp_mb(); } static inline void tasklet_disable(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_wait(t); smp_mb(); } static inline void tasklet_enable(struct tasklet_struct *t) { smp_mb__before_atomic(); atomic_dec(&t->count); } extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); extern void tasklet_setup(struct tasklet_struct *t, void (*callback)(struct tasklet_struct *)); /* * Autoprobing for irqs: * * probe_irq_on() and probe_irq_off() provide robust primitives * for accurate IRQ probing during kernel initialization. They are * reasonably simple to use, are not "fooled" by spurious interrupts, * and, unlike other attempts at IRQ probing, they do not get hung on * stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards). * * For reasonably foolproof probing, use them as follows: * * 1. clear and/or mask the device's internal interrupt. * 2. sti(); * 3. irqs = probe_irq_on(); // "take over" all unassigned idle IRQs * 4. enable the device and cause it to trigger an interrupt. * 5. wait for the device to interrupt, using non-intrusive polling or a delay. * 6. irq = probe_irq_off(irqs); // get IRQ number, 0=none, negative=multiple * 7. service the device to clear its pending interrupt. * 8. loop again if paranoia is required. * * probe_irq_on() returns a mask of allocated irq's. * * probe_irq_off() takes the mask as a parameter, * and returns the irq number which occurred, * or zero if none occurred, or a negative irq number * if more than one irq occurred. */ #if !defined(CONFIG_GENERIC_IRQ_PROBE) static inline unsigned long probe_irq_on(void) { return 0; } static inline int probe_irq_off(unsigned long val) { return 0; } static inline unsigned int probe_irq_mask(unsigned long val) { return 0; } #else extern unsigned long probe_irq_on(void); /* returns 0 on failure */ extern int probe_irq_off(unsigned long); /* returns 0 or negative on failure */ extern unsigned int probe_irq_mask(unsigned long); /* returns mask of ISA interrupts */ #endif #ifdef CONFIG_PROC_FS /* Initialize /proc/irq/ */ extern void init_irq_proc(void); #else static inline void init_irq_proc(void) { } #endif #ifdef CONFIG_IRQ_TIMINGS void irq_timings_enable(void); void irq_timings_disable(void); u64 irq_timings_next_event(u64 now); #endif struct seq_file; int show_interrupts(struct seq_file *p, void *v); int arch_show_interrupts(struct seq_file *p, int prec); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); /* * We want to know which function is an entrypoint of a hardirq or a softirq. */ #ifndef __irq_entry # define __irq_entry __section(".irqentry.text") #endif #define __softirq_entry __section(".softirqentry.text") #endif
33 33 31 33 33 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/mutex.h> #include <net/sock.h> #include "nf_internals.h" /* Sockopts only registered and called from user context, so net locking would be overkill. Also, [gs]etsockopt calls may sleep. */ static DEFINE_MUTEX(nf_sockopt_mutex); static LIST_HEAD(nf_sockopts); /* Do exclusive ranges overlap? */ static inline int overlap(int min1, int max1, int min2, int max2) { return max1 > min2 && min1 < max2; } /* Functions to register sockopt ranges (exclusive). */ int nf_register_sockopt(struct nf_sockopt_ops *reg) { struct nf_sockopt_ops *ops; int ret = 0; mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == reg->pf && (overlap(ops->set_optmin, ops->set_optmax, reg->set_optmin, reg->set_optmax) || overlap(ops->get_optmin, ops->get_optmax, reg->get_optmin, reg->get_optmax))) { pr_debug("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", ops->set_optmin, ops->set_optmax, ops->get_optmin, ops->get_optmax, reg->set_optmin, reg->set_optmax, reg->get_optmin, reg->get_optmax); ret = -EBUSY; goto out; } } list_add(&reg->list, &nf_sockopts); out: mutex_unlock(&nf_sockopt_mutex); return ret; } EXPORT_SYMBOL(nf_register_sockopt); void nf_unregister_sockopt(struct nf_sockopt_ops *reg) { mutex_lock(&nf_sockopt_mutex); list_del(&reg->list); mutex_unlock(&nf_sockopt_mutex); } EXPORT_SYMBOL(nf_unregister_sockopt); static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, int val, int get) { struct nf_sockopt_ops *ops; mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == pf) { if (!try_module_get(ops->owner)) goto out_nosup; if (get) { if (val >= ops->get_optmin && val < ops->get_optmax) goto out; } else { if (val >= ops->set_optmin && val < ops->set_optmax) goto out; } module_put(ops->owner); } } out_nosup: ops = ERR_PTR(-ENOPROTOOPT); out: mutex_unlock(&nf_sockopt_mutex); return ops; } int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt, unsigned int len) { struct nf_sockopt_ops *ops; int ret; ops = nf_sockopt_find(sk, pf, val, 0); if (IS_ERR(ops)) return PTR_ERR(ops); ret = ops->set(sk, val, opt, len); module_put(ops->owner); return ret; } EXPORT_SYMBOL(nf_setsockopt); int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { struct nf_sockopt_ops *ops; int ret; ops = nf_sockopt_find(sk, pf, val, 1); if (IS_ERR(ops)) return PTR_ERR(ops); ret = ops->get(sk, val, opt, len); module_put(ops->owner); return ret; } EXPORT_SYMBOL(nf_getsockopt);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0-or-later /* * connector.c * * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net> * All rights reserved. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <linux/moduleparam.h> #include <linux/connector.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector."); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_CONNECTOR); static struct cn_dev cdev; static int cn_already_initialized; /* * Sends mult (multiple) cn_msg at a time. * * msg->seq and msg->ack are used to determine message genealogy. * When someone sends message it puts there locally unique sequence * and random acknowledge numbers. Sequence number may be copied into * nlmsghdr->nlmsg_seq too. * * Sequence number is incremented with each message to be sent. * * If we expect a reply to our message then the sequence number in * received message MUST be the same as in original message, and * acknowledge number MUST be the same + 1. * * If we receive a message and its sequence number is not equal to the * one we are expecting then it is a new message. * * If we receive a message and its sequence number is the same as one * we are expecting but it's acknowledgement number is not equal to * the acknowledgement number in the original message + 1, then it is * a new message. * * If msg->len != len, then additional cn_msg messages are expected following * the first msg. * * The message is sent to, the portid if given, the group if given, both if * both, or if both are zero then the group is looked up and sent there. */ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, gfp_t gfp_mask, netlink_filter_fn filter, void *filter_data) { struct cn_callback_entry *__cbq; unsigned int size; struct sk_buff *skb; struct nlmsghdr *nlh; struct cn_msg *data; struct cn_dev *dev = &cdev; u32 group = 0; int found = 0; if (portid || __group) { group = __group; } else { spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(__cbq, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&__cbq->id.id, &msg->id)) { found = 1; group = __cbq->group; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (!found) return -ENODEV; } if (!portid && !netlink_has_listeners(dev->nls, group)) return -ESRCH; size = sizeof(*msg) + len; skb = nlmsg_new(size, gfp_mask); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, msg->seq, NLMSG_DONE, size, 0); if (!nlh) { kfree_skb(skb); return -EMSGSIZE; } data = nlmsg_data(nlh); memcpy(data, msg, size); NETLINK_CB(skb).dst_group = group; if (group) return netlink_broadcast_filtered(dev->nls, skb, portid, group, gfp_mask, filter, (void *)filter_data); return netlink_unicast(dev->nls, skb, portid, !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); /* same as cn_netlink_send_mult except msg->len is used for len */ int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, gfp_t gfp_mask) { return cn_netlink_send_mult(msg, msg->len, portid, __group, gfp_mask, NULL, NULL); } EXPORT_SYMBOL_GPL(cn_netlink_send); /* * Callback helper - queues work and setup destructor for given data. */ static int cn_call_callback(struct sk_buff *skb) { struct nlmsghdr *nlh; struct cn_callback_entry *i, *cbq = NULL; struct cn_dev *dev = &cdev; struct cn_msg *msg = nlmsg_data(nlmsg_hdr(skb)); struct netlink_skb_parms *nsp = &NETLINK_CB(skb); int err = -ENODEV; /* verify msg->len is within skb */ nlh = nlmsg_hdr(skb); if (nlh->nlmsg_len < NLMSG_HDRLEN + sizeof(struct cn_msg) + msg->len) return -EINVAL; spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(i, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&i->id.id, &msg->id)) { refcount_inc(&i->refcnt); cbq = i; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (cbq != NULL) { cbq->callback(msg, nsp); kfree_skb(skb); cn_queue_release_callback(cbq); err = 0; } return err; } /* * Allow non-root access for NETLINK_CONNECTOR family having CN_IDX_PROC * multicast group. */ static int cn_bind(struct net *net, int group) { unsigned long groups = (unsigned long) group; if (ns_capable(net->user_ns, CAP_NET_ADMIN)) return 0; if (test_bit(CN_IDX_PROC - 1, &groups)) return 0; return -EPERM; } static void cn_release(struct sock *sk, unsigned long *groups) { if (groups && test_bit(CN_IDX_PROC - 1, groups)) { kfree(sk->sk_user_data); sk->sk_user_data = NULL; } } /* * Main netlink receiving function. * * It checks skb, netlink header and msg sizes, and calls callback helper. */ static void cn_rx_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; int len, err; if (skb->len >= NLMSG_HDRLEN) { nlh = nlmsg_hdr(skb); len = nlmsg_len(nlh); if (len < (int)sizeof(struct cn_msg) || skb->len < nlh->nlmsg_len || len > CONNECTOR_MAX_MSG_SIZE) return; err = cn_call_callback(skb_get(skb)); if (err < 0) kfree_skb(skb); } } /* * Callback add routing - adds callback with given ID and name. * If there is registered callback with the same ID it will not be added. * * May sleep. */ int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { struct cn_dev *dev = &cdev; if (!cn_already_initialized) return -EAGAIN; return cn_queue_add_callback(dev->cbdev, name, id, callback); } EXPORT_SYMBOL_GPL(cn_add_callback); /* * Callback remove routing - removes callback * with given ID. * If there is no registered callback with given * ID nothing happens. * * May sleep while waiting for reference counter to become zero. */ void cn_del_callback(const struct cb_id *id) { struct cn_dev *dev = &cdev; cn_queue_del_callback(dev->cbdev, id); } EXPORT_SYMBOL_GPL(cn_del_callback); static int __maybe_unused cn_proc_show(struct seq_file *m, void *v) { struct cn_queue_dev *dev = cdev.cbdev; struct cn_callback_entry *cbq; seq_printf(m, "Name ID\n"); spin_lock_bh(&dev->queue_lock); list_for_each_entry(cbq, &dev->queue_list, callback_entry) { seq_printf(m, "%-15s %u:%u\n", cbq->id.name, cbq->id.id.idx, cbq->id.id.val); } spin_unlock_bh(&dev->queue_lock); return 0; } static int cn_init(void) { struct cn_dev *dev = &cdev; struct netlink_kernel_cfg cfg = { .groups = CN_NETLINK_USERS + 0xf, .input = cn_rx_skb, .flags = NL_CFG_F_NONROOT_RECV, .bind = cn_bind, .release = cn_release, }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg); if (!dev->nls) return -EIO; dev->cbdev = cn_queue_alloc_dev("cqueue", dev->nls); if (!dev->cbdev) { netlink_kernel_release(dev->nls); return -EINVAL; } cn_already_initialized = 1; proc_create_single("connector", S_IRUGO, init_net.proc_net, cn_proc_show); return 0; } static void cn_fini(void) { struct cn_dev *dev = &cdev; cn_already_initialized = 0; remove_proc_entry("connector", init_net.proc_net); cn_queue_free_dev(dev->cbdev); netlink_kernel_release(dev->nls); } subsys_initcall(cn_init); module_exit(cn_fini);
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET An implementation of the SOCKET network access protocol. * This is the master header file for the Linux NET layer, * or, in plain English: the networking handling part of the * kernel. * * Version: @(#)net.h 1.0.3 05/25/93 * * Authors: Orest Zborowski, <obz@Kodak.COM> * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_NET_H #define _LINUX_NET_H #include <linux/stringify.h> #include <linux/random.h> #include <linux/wait.h> #include <linux/fcntl.h> /* For O_CLOEXEC and O_NONBLOCK */ #include <linux/rcupdate.h> #include <linux/once.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/sockptr.h> #include <uapi/linux/net.h> struct poll_table_struct; struct pipe_inode_info; struct inode; struct file; struct net; /* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. * Eventually all flags will be in sk->sk_wq->flags. */ enum socket_flags { SOCKWQ_ASYNC_NOSPACE, SOCKWQ_ASYNC_WAITDATA, SOCK_NOSPACE, SOCK_SUPPORT_ZC, SOCK_CUSTOM_SOCKOPT, }; #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types * @SOCK_STREAM: stream (connection) socket * @SOCK_DGRAM: datagram (conn.less) socket * @SOCK_RAW: raw socket * @SOCK_RDM: reliably-delivered message * @SOCK_SEQPACKET: sequential packet socket * @SOCK_DCCP: Datagram Congestion Control Protocol socket * @SOCK_PACKET: linux specific way of getting packets at the dev level. * For writing rarp and other similar things on the user level. * * When adding some new socket type please * grep ARCH_HAS_SOCKET_TYPE include/asm-* /socket.h, at least MIPS * overrides this enum for binary compat reasons. */ enum sock_type { SOCK_STREAM = 1, SOCK_DGRAM = 2, SOCK_RAW = 3, SOCK_RDM = 4, SOCK_SEQPACKET = 5, SOCK_DCCP = 6, SOCK_PACKET = 10, }; #endif /* ARCH_HAS_SOCKET_TYPES */ #define SOCK_MAX (SOCK_PACKET + 1) /* Mask which covers at least up to SOCK_MASK-1. The * remaining bits are used as flags. */ #define SOCK_TYPE_MASK 0xf /* Flags for socket, socketpair, accept4 */ #define SOCK_CLOEXEC O_CLOEXEC #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #define SOCK_COREDUMP O_NOCTTY /** * enum sock_shutdown_cmd - Shutdown types * @SHUT_RD: shutdown receptions * @SHUT_WR: shutdown transmissions * @SHUT_RDWR: shutdown receptions/transmissions */ enum sock_shutdown_cmd { SHUT_RD, SHUT_WR, SHUT_RDWR, }; struct socket_wq { /* Note: wait MUST be first field of socket_wq */ wait_queue_head_t wait; struct fasync_struct *fasync_list; unsigned long flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */ struct rcu_head rcu; } ____cacheline_aligned_in_smp; /** * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) * @flags: socket flags (%SOCK_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation * @wq: wait queue for several uses */ struct socket { socket_state state; short type; unsigned long flags; struct file *file; struct sock *sk; const struct proto_ops *ops; /* Might change with IPV6_ADDRFORM or MPTCP. */ struct socket_wq wq; }; /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet * have multiple different users of the data that * we read from a file. * * The simplest case just copies the data to user * mode. */ typedef struct { size_t written; size_t count; union { char __user *buf; void *data; } arg; int error; } read_descriptor_t; struct vm_area_struct; struct page; struct msghdr; struct module; struct sk_buff; struct proto_accept_arg; typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, unsigned int, size_t); typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *); struct proto_ops { int family; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, struct sockaddr_unsized *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, struct sockaddr_unsized *vaddr, int sockaddr_len, int flags); int (*socketpair)(struct socket *sock1, struct socket *sock2); int (*accept) (struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); #endif int (*gettstamp) (struct socket *sock, void __user *userstamp, bool timeval, bool time32); int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); void (*show_fdinfo)(struct seq_file *m, struct socket *sock); int (*sendmsg) (struct socket *sock, struct msghdr *m, size_t total_len); /* Notes for implementing recvmsg: * =============================== * msg->msg_namelen should get updated by the recvmsg handlers * iff msg_name != NULL. It is by default 0 to prevent * returning uninitialized memory to user space. The recvfrom * handlers can assume that msg.msg_name is either NULL or has * a minimum size of sizeof(struct sockaddr_storage). */ int (*recvmsg) (struct socket *sock, struct msghdr *m, size_t total_len, int flags); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); void (*splice_eof)(struct socket *sock); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); /* The following functions are called internally by kernel with * sock lock already held. */ int (*read_sock)(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); /* This is different from read_sock(), it reads an entire skb at a time. */ int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ type dst = ({ __sockaddr_check_size(sizeof(*dst)); (type) src; }) struct net_proto_family { int family; int (*create)(struct net *net, struct socket *sock, int protocol, int kern); struct module *owner; }; struct iovec; struct kvec; enum { SOCK_WAKE_IO, SOCK_WAKE_WAITD, SOCK_WAKE_SPACE, SOCK_WAKE_URG, }; int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); bool sock_is_registered(int family); int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); int sock_create(int family, int type, int proto, struct socket **res); int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); struct socket *sock_alloc(void); void sock_release(struct socket *sock); int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags); struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname); struct socket *sockfd_lookup(int fd, int *err); struct socket *sock_from_file(struct file *file); #define sockfd_put(sock) fput(sock->file) int net_ratelimit(void); #define net_ratelimited_function(function, ...) \ do { \ if (net_ratelimit()) \ function(__VA_ARGS__); \ } while (0) #define net_emerg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_emerg, fmt, ##__VA_ARGS__) #define net_alert_ratelimited(fmt, ...) \ net_ratelimited_function(pr_alert, fmt, ##__VA_ARGS__) #define net_crit_ratelimited(fmt, ...) \ net_ratelimited_function(pr_crit, fmt, ##__VA_ARGS__) #define net_err_ratelimited(fmt, ...) \ net_ratelimited_function(pr_err, fmt, ##__VA_ARGS__) #define net_notice_ratelimited(fmt, ...) \ net_ratelimited_function(pr_notice, fmt, ##__VA_ARGS__) #define net_warn_ratelimited(fmt, ...) \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \ net_ratelimit()) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), \ ##__VA_ARGS__); \ } while (0) #elif defined(DEBUG) #define net_dbg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) #else #define net_dbg_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) /* * E.g. XFS meta- & log-data is in slab pages, or bcache meta * data pages, or other high order pages allocated by * __get_free_pages() without __GFP_COMP, which have a page_count * of 0 and/or have PageSlab() set. We cannot use send_page for * those, as that does get_page(); put_page(); and would cause * either a VM_BUG directly, or __page_cache_release a page that * would actually still be referenced by someone, leading to some * obscure delayed Oops somewhere else. */ static inline bool sendpage_ok(struct page *page) { return !PageSlab(page) && page_count(page) >= 1; } /* * Check sendpage_ok on contiguous pages. */ static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) { struct page *p = page + (offset >> PAGE_SHIFT); size_t count = 0; while (count < len) { if (!sendpage_ok(p)) return false; p++; count += PAGE_SIZE; } return true; } int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen); int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); /* Routine returns the IP overhead imposed by a (caller-protected) socket. */ u32 kernel_sock_ip_overhead(struct sock *sk); #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) #define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto)) #define MODULE_ALIAS_NET_PF_PROTO_TYPE(pf, proto, type) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto) \ "-type-" __stringify(type)) #define MODULE_ALIAS_NET_PF_PROTO_NAME(pf, proto, name) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto) \ name) #endif /* _LINUX_NET_H */
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct privflags_req_info { struct ethnl_req_info base; }; struct privflags_reply_data { struct ethnl_reply_data base; const char (*priv_flag_names)[ETH_GSTRING_LEN]; unsigned int n_priv_flags; u32 priv_flags; }; #define PRIVFLAGS_REPDATA(__reply_base) \ container_of(__reply_base, struct privflags_reply_data, base) const struct nla_policy ethnl_privflags_get_policy[] = { [ETHTOOL_A_PRIVFLAGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int ethnl_get_priv_flags_info(struct net_device *dev, unsigned int *count, const char (**names)[ETH_GSTRING_LEN]) { const struct ethtool_ops *ops = dev->ethtool_ops; int nflags; nflags = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (nflags < 0) return nflags; if (names) { *names = kcalloc(nflags, ETH_GSTRING_LEN, GFP_KERNEL); if (!*names) return -ENOMEM; ops->get_strings(dev, ETH_SS_PRIV_FLAGS, (u8 *)*names); } /* We can pass more than 32 private flags to userspace via netlink but * we cannot get more with ethtool_ops::get_priv_flags(). Note that we * must not adjust nflags before allocating the space for flag names * as the buffer must be large enough for all flags. */ if (WARN_ONCE(nflags > 32, "device %s reports more than 32 private flags (%d)\n", netdev_name(dev), nflags)) nflags = 32; *count = nflags; return 0; } static int privflags_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const char (*names)[ETH_GSTRING_LEN]; const struct ethtool_ops *ops; unsigned int nflags; int ret; ops = dev->ethtool_ops; if (!ops->get_priv_flags || !ops->get_sset_count || !ops->get_strings) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = ethnl_get_priv_flags_info(dev, &nflags, &names); if (ret < 0) goto out_ops; data->priv_flags = ops->get_priv_flags(dev); data->priv_flag_names = names; data->n_priv_flags = nflags; out_ops: ethnl_ops_complete(dev); return ret; } static int privflags_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); return ethnl_bitset32_size(&data->priv_flags, &all_flags, data->n_priv_flags, data->priv_flag_names, compact); } static int privflags_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); return ethnl_put_bitset32(skb, ETHTOOL_A_PRIVFLAGS_FLAGS, &data->priv_flags, &all_flags, data->n_priv_flags, data->priv_flag_names, compact); } static void privflags_cleanup_data(struct ethnl_reply_data *reply_data) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_data); kfree(data->priv_flag_names); } /* PRIVFLAGS_SET */ const struct nla_policy ethnl_privflags_set_policy[] = { [ETHTOOL_A_PRIVFLAGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED }, }; static int ethnl_set_privflags_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; if (!info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS]) return -EINVAL; if (!ops->get_priv_flags || !ops->set_priv_flags || !ops->get_sset_count || !ops->get_strings) return -EOPNOTSUPP; return 1; } static int ethnl_set_privflags(struct ethnl_req_info *req_info, struct genl_info *info) { const char (*names)[ETH_GSTRING_LEN] = NULL; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; unsigned int nflags; bool mod = false; bool compact; u32 flags; int ret; ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact); if (ret < 0) return ret; ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names); if (ret < 0) return ret; flags = dev->ethtool_ops->get_priv_flags(dev); ret = ethnl_update_bitset32(&flags, nflags, tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names, info->extack, &mod); if (ret < 0 || !mod) goto out_free; ret = dev->ethtool_ops->set_priv_flags(dev, flags); if (ret < 0) goto out_free; ret = 1; out_free: kfree(names); return ret; } const struct ethnl_request_ops ethnl_privflags_request_ops = { .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, .req_info_size = sizeof(struct privflags_req_info), .reply_data_size = sizeof(struct privflags_reply_data), .prepare_data = privflags_prepare_data, .reply_size = privflags_reply_size, .fill_reply = privflags_fill_reply, .cleanup_data = privflags_cleanup_data, .set_validate = ethnl_set_privflags_validate, .set = ethnl_set_privflags, .set_ntf_cmd = ETHTOOL_MSG_PRIVFLAGS_NTF, };
1237 1237 1236 1237 1238 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Resilient Queued Spin Lock * * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. * * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> */ #ifndef __ASM_GENERIC_RQSPINLOCK_H #define __ASM_GENERIC_RQSPINLOCK_H #include <linux/types.h> #include <vdso/time64.h> #include <linux/percpu.h> #ifdef CONFIG_QUEUED_SPINLOCKS #include <asm/qspinlock.h> #endif struct rqspinlock { union { atomic_t val; u32 locked; }; }; /* Even though this is same as struct rqspinlock, we need to emit a distinct * type in BTF for BPF programs. */ struct bpf_res_spin_lock { u32 val; }; struct qspinlock; #ifdef CONFIG_QUEUED_SPINLOCKS typedef struct qspinlock rqspinlock_t; #else typedef struct rqspinlock rqspinlock_t; #endif extern int resilient_tas_spin_lock(rqspinlock_t *lock); #ifdef CONFIG_QUEUED_SPINLOCKS extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val); #endif #ifndef resilient_virt_spin_lock_enabled static __always_inline bool resilient_virt_spin_lock_enabled(void) { return false; } #endif #ifndef resilient_virt_spin_lock static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock) { return 0; } #endif /* * Default timeout for waiting loops is 0.25 seconds */ #define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4) /* * Choose 31 as it makes rqspinlock_held cacheline-aligned. */ #define RES_NR_HELD 31 struct rqspinlock_held { int cnt; void *locks[RES_NR_HELD]; }; DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); static __always_inline void grab_held_lock_entry(void *lock) { int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt); if (unlikely(cnt > RES_NR_HELD)) { /* Still keep the inc so we decrement later. */ return; } /* * Implied compiler barrier in per-CPU operations; otherwise we can have * the compiler reorder inc with write to table, allowing interrupts to * overwrite and erase our write to the table (as on interrupt exit it * will be reset to NULL). * * It is fine for cnt inc to be reordered wrt remote readers though, * they won't observe our entry until the cnt update is visible, that's * all. */ this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock); } /* * We simply don't support out-of-order unlocks, and keep the logic simple here. * The verifier prevents BPF programs from unlocking out-of-order, and the same * holds for in-kernel users. * * It is possible to run into misdetection scenarios of AA deadlocks on the same * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct * logic to preserve right entries in the table would be to walk the array of * held locks and swap and clear out-of-order entries, but that's too * complicated and we don't have a compelling use case for out of order unlocking. */ static __always_inline void release_held_lock_entry(void) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); if (unlikely(rqh->cnt > RES_NR_HELD)) goto dec; WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); dec: /* * Reordering of clearing above with inc and its write in * grab_held_lock_entry that came before us (in same acquisition * attempt) is ok, we either see a valid entry or NULL when it's * visible. * * But this helper is invoked when we unwind upon failing to acquire the * lock. Unlike the unlock path which constitutes a release store after * we clear the entry, we need to emit a write barrier here. Otherwise, * we may have a situation as follows: * * <error> for lock B * release_held_lock_entry * * grab_held_lock_entry * try_cmpxchg_acquire for lock A * * Lack of any ordering means reordering may occur such that dec, inc * are done before entry is overwritten. This permits a remote lock * holder of lock B (which this CPU failed to acquire) to now observe it * as being attempted on this CPU, and may lead to misdetection (if this * CPU holds a lock it is attempting to acquire, leading to false ABBA * diagnosis). * * The case of unlock is treated differently due to NMI reentrancy, see * comments in res_spin_unlock. * * In theory we don't have a problem if the dec and WRITE_ONCE above get * reordered with each other, we either notice an empty NULL entry on * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which * cannot be observed (if dec precedes WRITE_ONCE). * * Emit the write barrier _before_ the dec, this permits dec-inc * reordering but that is harmless as we'd have new entry set to NULL * already, i.e. they cannot precede the NULL store above. */ smp_wmb(); this_cpu_dec(rqspinlock_held_locks.cnt); } #ifdef CONFIG_QUEUED_SPINLOCKS /** * res_spin_lock - acquire a queued spinlock * @lock: Pointer to queued spinlock structure * * Return: * * 0 - Lock was acquired successfully. * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. * * -ETIMEDOUT - Lock acquisition failed because of timeout. */ static __always_inline int res_spin_lock(rqspinlock_t *lock) { int val = 0; /* * Grab the deadlock detection entry before doing the cmpxchg, so that * reentrancy due to NMIs between the succeeding cmpxchg and creation of * held lock entry can correctly detect an acquisition attempt in the * interrupted context. * * cmpxchg lock A * <NMI> * res_spin_lock(A) --> missed AA, leads to timeout * </NMI> * grab_held_lock_entry(A) */ grab_held_lock_entry(lock); if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) return 0; return resilient_queued_spin_lock_slowpath(lock, val); } #else #define res_spin_lock(lock) resilient_tas_spin_lock(lock) #endif /* CONFIG_QUEUED_SPINLOCKS */ static __always_inline void res_spin_unlock(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); /* * Release barrier, ensures correct ordering. Perform release store * instead of queued_spin_unlock, since we use this function for the TAS * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear * the full 4-byte lockword. * * Perform the smp_store_release before clearing the lock entry so that * NMIs landing in the unlock path can correctly detect AA issues. The * opposite order shown below may lead to missed AA checks: * * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL) * <NMI> * res_spin_lock(A) --> missed AA, leads to timeout * </NMI> * smp_store_release(A->locked, 0) */ smp_store_release(&lock->locked, 0); if (likely(rqh->cnt <= RES_NR_HELD)) WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); this_cpu_dec(rqspinlock_held_locks.cnt); } #ifdef CONFIG_QUEUED_SPINLOCKS #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; }) #else #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; }) #endif #define raw_res_spin_lock(lock) \ ({ \ int __ret; \ preempt_disable(); \ __ret = res_spin_lock(lock); \ if (__ret) \ preempt_enable(); \ __ret; \ }) #define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); }) #define raw_res_spin_lock_irqsave(lock, flags) \ ({ \ int __ret; \ local_irq_save(flags); \ __ret = raw_res_spin_lock(lock); \ if (__ret) \ local_irq_restore(flags); \ __ret; \ }) #define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) #endif /* __ASM_GENERIC_RQSPINLOCK_H */
50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming); int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); void batadv_tt_global_entry_release(struct kref *ref); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid); void batadv_tt_free(struct batadv_priv *bat_priv); bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid); void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv); bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); int batadv_tt_cache_init(void); void batadv_tt_cache_destroy(void); /** * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ static inline void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { if (!tt_global_entry) return; kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); } #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
15 3 2 2 8 1 2 6 6 6 4 4 4 4 4 4 2 2 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_connmark.c netfilter connmark retriever action * skb mark is over-written * * Copyright (c) 2011 Felix Fietkau <nbd@openwrt.org> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/pkt_cls.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_connmark.h> #include <net/tc_act/tc_connmark.h> #include <net/tc_wrapper.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> static struct tc_action_ops act_connmark_ops; TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct tcf_connmark_info *ca = to_connmark(a); struct tcf_connmark_parms *parms; struct nf_conntrack_zone zone; struct nf_conn *c; int proto; tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); parms = rcu_dereference_bh(ca->parms); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): if (skb->len < sizeof(struct iphdr)) goto out; proto = NFPROTO_IPV4; break; case htons(ETH_P_IPV6): if (skb->len < sizeof(struct ipv6hdr)) goto out; proto = NFPROTO_IPV6; break; default: goto out; } c = nf_ct_get(skb, &ctinfo); if (c) { skb->mark = READ_ONCE(c->mark); goto count; } if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, parms->net, &tuple)) goto out; zone.id = parms->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; thash = nf_conntrack_find_get(parms->net, &zone, &tuple); if (!thash) goto out; c = nf_ct_tuplehash_to_ctrack(thash); skb->mark = READ_ONCE(c->mark); nf_ct_put(c); count: /* using overlimits stats to count how many packets marked */ tcf_action_inc_overlimit_qstats(&ca->common); out: return parms->action; } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { [TCA_CONNMARK_PARMS] = { .len = sizeof(struct tc_connmark) }, }; static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); struct tcf_connmark_parms *nparms, *oparms; struct nlattr *tb[TCA_CONNMARK_MAX + 1]; bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_connmark_info *ci; struct tc_connmark *parm; int ret = 0, err; u32 index; if (!nla) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_CONNMARK_MAX, nla, connmark_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_CONNMARK_PARMS]) return -EINVAL; nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) return -ENOMEM; parm = nla_data(tb[TCA_CONNMARK_PARMS]); index = parm->index; ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_connmark_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); err = ret; goto out_free; } ci = to_connmark(*a); nparms->net = net; nparms->zone = parm->zone; ret = ACT_P_CREATED; } else if (ret > 0) { ci = to_connmark(*a); if (bind) { err = ACT_P_BOUND; goto out_free; } if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } nparms->net = rtnl_dereference(ci->parms)->net; nparms->zone = parm->zone; ret = 0; } else { err = ret; goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; nparms->action = parm->action; spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); spin_unlock_bh(&ci->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (oparms) kfree_rcu(oparms, rcu); return ret; release_idr: tcf_idr_release(*a, bind); out_free: kfree(nparms); return err; } static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); const struct tcf_connmark_parms *parms; struct tc_connmark opt; struct tcf_t t; memset(&opt, 0, sizeof(opt)); opt.index = ci->tcf_index; opt.refcnt = refcount_read(&ci->tcf_refcnt) - ref; opt.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind; rcu_read_lock(); parms = rcu_dereference(ci->parms); opt.action = parms->action; opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; rcu_read_unlock(); return skb->len; nla_put_failure: rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } static void tcf_connmark_cleanup(struct tc_action *a) { struct tcf_connmark_info *ci = to_connmark(a); struct tcf_connmark_parms *parms; parms = rcu_dereference_protected(ci->parms, 1); if (parms) kfree_rcu(parms, rcu); } static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .id = TCA_ID_CONNMARK, .owner = THIS_MODULE, .act = tcf_connmark_act, .dump = tcf_connmark_dump, .init = tcf_connmark_init, .cleanup = tcf_connmark_cleanup, .size = sizeof(struct tcf_connmark_info), }; MODULE_ALIAS_NET_ACT("connmark"); static __net_init int connmark_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); return tc_action_net_init(net, tn, &act_connmark_ops); } static void __net_exit connmark_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_connmark_ops.net_id); } static struct pernet_operations connmark_net_ops = { .init = connmark_init_net, .exit_batch = connmark_exit_net, .id = &act_connmark_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init connmark_init_module(void) { return tcf_register_action(&act_connmark_ops, &connmark_net_ops); } static void __exit connmark_cleanup_module(void) { tcf_unregister_action(&act_connmark_ops, &connmark_net_ops); } module_init(connmark_init_module); module_exit(connmark_cleanup_module); MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>"); MODULE_DESCRIPTION("Connection tracking mark restoring"); MODULE_LICENSE("GPL");
29 29 29 29 29 29 29 26 3 29 29 28 29 26 26 26 26 26 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 4 4 4 4 4 4 4 4 4 4 243 243 2 2 2 174 175 127 134 175 49 49 6 42 49 5 3 3 3 3 3 3 49 33 32 33 32 33 33 33 33 33 26 26 26 33 13 13 13 4 13 4 13 13 4 4 13 13 13 4 4 4 4 13 13 13 4 13 4 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 72 4 68 72 72 72 72 1 72 72 2 72 72 72 72 72 72 72 72 87 87 87 87 78 87 77 85 85 3 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds */ /* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ #include <linux/export.h> #include <linux/compiler.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/error-injection.h> #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include <linux/delayacct.h> #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> #include <linux/migrate.h> #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> #include <linux/sysctl.h> #include <linux/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/filemap.h> /* * FIXME: remove all knowledge of the buffer layer from the core VM */ #include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> #include "swap.h" /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ /* * Lock ordering: * * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->block_dirty_folio) * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * * ->i_rwsem * ->invalidate_lock (acquired by fs in truncate path) * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock * ->invalidate_lock (filemap_fault) * ->lock_page (filemap_fault, access_process_vm) * * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) * * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { XA_STATE(xas, &mapping->i_pages, folio->index); long nr = 1; mapping_set_update(&xas, mapping); xas_set_order(&xas, folio->index, folio_order(folio)); nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } static void filemap_unaccount_folio(struct address_space *mapping, struct folio *folio) { long nr; VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's * a good bet that actually the page is unmapped * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. */ atomic_set(&folio->_mapcount, -1); folio_ref_sub(folio, mapcount); } } } /* hugetlb folios do not participate in page cache accounting. */ if (folio_test_hugetlb(folio)) return; nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) mod_node_page_state(folio_pgdat(folio), NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ void __filemap_remove_folio(struct folio *folio, void *shadow) { struct address_space *mapping = folio->mapping; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); page_cache_delete(mapping, folio, shadow); } void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); folio_put_refs(folio, folio_nr_pages(folio)); } /** * filemap_remove_folio - Remove folio from page cache. * @folio: The folio. * * This must be called only on folios that are locked and have been * verified to be in the page cache. It will never put the folio into * the free list because the caller has a reference on the page. */ void filemap_remove_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); } /* * page_cache_delete_batch - delete several folios from page cache * @mapping: the mapping to which folios belong * @fbatch: batch of folios to delete * * The function walks over mapping->i_pages and removes folios passed in * @fbatch from the mapping. The function expects @fbatch to be sorted * by page index and is optimised for it to be dense. * It tolerates holes in @fbatch (mapping entries at those indices are not * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); long total_pages = 0; int i = 0; struct folio *folio; mapping_set_update(&xas, mapping); xas_for_each(&xas, folio, ULONG_MAX) { if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our * pages locked so they are protected from being removed. * If we see a page whose index is higher than ours, it * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ if (folio != fbatch->folios[i]) { VM_BUG_ON_FOLIO(folio->index > fbatch->folios[i]->index, folio); continue; } WARN_ON_ONCE(!folio_test_locked(folio)); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies on it */ i++; xas_store(&xas, NULL); total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch) { int i; if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); } page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ if (test_bit(AS_ENOSPC, &mapping->flags) && test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_bit(AS_EIO, &mapping->flags) && test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } EXPORT_SYMBOL(filemap_check_errors); static int filemap_check_and_keep_errors(struct address_space *mapping) { /* Check for outstanding write errors */ if (test_bit(AS_EIO, &mapping->flags)) return -EIO; if (test_bit(AS_ENOSPC, &mapping->flags)) return -ENOSPC; return 0; } static int filemap_writeback(struct address_space *mapping, loff_t start, loff_t end, enum writeback_sync_modes sync_mode, long *nr_to_write) { struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX, .range_start = start, .range_end = end, }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); wbc_detach_inode(&wbc); if (!ret && nr_to_write) *nr_to_write = wbc.nr_to_write; return ret; } /** * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * * This is a data integrity operation that waits upon dirty or in writeback * pages. * * Return: %0 on success, negative error code otherwise. */ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } EXPORT_SYMBOL(filemap_fdatawrite_range); int filemap_fdatawrite(struct address_space *mapping) { return filemap_fdatawrite_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_fdatawrite); /** * filemap_flush_range - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end) { return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); /* * Start writeback on @nr_to_write pages from @mapping. No one but the existing * btrfs caller should be using this. Talk to linux-mm if you think adding a * new caller is a good idea. */ int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) { return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, nr_to_write); } EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. * * Return: %true if at least one page exists in the specified range, * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; rcu_read_lock(); for (;;) { folio = xas_find(&xas, max); if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to * release the RCU lock anyway. It is enough to know that * there was a page here recently. */ break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct folio_batch fbatch; unsigned nr_folios; folio_batch_init(&fbatch); while (index <= end) { unsigned i; nr_folios = filemap_get_folios_tag(mapping, &index, end, PAGECACHE_TAG_WRITEBACK, &fbatch); if (!nr_folios) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); } folio_batch_release(&fbatch); cond_resched(); } } /** * filemap_fdatawait_range - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space * in the given range and wait for all of them. Check error status of * the address space and return it. * * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait_range_keep_errors - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space in the * given range and wait for all of them. Unlike filemap_fdatawait_range(), * this function does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); /** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the address space that file * refers to, in the given range and wait for all of them. Check error * status of the address space vs. the file->f_wb_err cursor and return it. * * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space vs. the file->f_wb_err cursor. */ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { struct address_space *mapping = file->f_mapping; __filemap_fdatawait_range(mapping, start_byte, end_byte); return file_check_and_advance_wb_err(file); } EXPORT_SYMBOL(file_fdatawait_range); /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space * and wait for all of them. Unlike filemap_fdatawait(), this function * does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) * * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { return mapping->nrpages; } bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) continue; if (folio_test_dirty(folio) || folio_test_locked(folio) || folio_test_writeback(folio)) break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { int err = 0, err2; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = filemap_check_errors(mapping); if (!err) err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } EXPORT_SYMBOL(__filemap_set_wb_err); /** * file_check_and_advance_wb_err - report wb error (if any) that was previously * and advance wb_err to current one * @file: struct file on which the error is being reported * * When userland calls fsync (or something like nfsd does the equivalent), we * want to report any writeback errors that occurred since the last fsync (or * since the file was opened if there haven't been any). * * Grab the wb_err from the mapping. If it matches what we have in the file, * then just quickly return 0. The file is all caught up. * * If it doesn't match, then take the mapping value, set the "seen" flag in * it and try to swap it into place. If it works, or another task beat us * to it with the new value, then update the f_wb_err and return the error * portion. The error at this point must be reported via proper channels * (a'la fsync, or NFS COMMIT operation, etc.). * * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. * * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { int err = 0; errseq_t old = READ_ONCE(file->f_wb_err); struct address_space *mapping = file->f_mapping; /* Locklessly handle the common case where nothing has changed */ if (errseq_check(&mapping->wb_err, old)) { /* Something changed, must use slow path */ spin_lock(&file->f_lock); old = file->f_wb_err; err = errseq_check_and_advance(&mapping->wb_err, &file->f_wb_err); trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } /* * We're mostly using this function as a drop in replacement for * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect * that the legacy code would have had on these flags. */ clear_bit(AS_EIO, &mapping->flags); clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); /** * file_write_and_wait_range - write out & wait on a file range * @file: file pointing to address_space with pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. * * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { int err = 0, err2; struct address_space *mapping = file->f_mapping; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = file_check_and_advance_wb_err(file); if (!err) err = err2; return err; } EXPORT_SYMBOL(file_write_and_wait_range); /** * replace_page_cache_folio - replace a pagecache folio with a new one * @old: folio to be replaced * @new: folio to replace with * * This function replaces a folio in the pagecache with a new one. On * success it acquires the pagecache reference for the new folio and * drops it for the old folio. Both the old and new folios must be * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ void replace_page_cache_folio(struct folio *old, struct folio *new) { struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); VM_BUG_ON_FOLIO(new->mapping, new); folio_get(new); new->mapping = mapping; new->index = offset; mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); folio_put(old); } EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); bool huge; long nr; unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = xas.xa_index; for (;;) { int order = -1; void *entry, *old = NULL; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; if (!xa_is_value(entry)) { xas_set_err(&xas, -EEXIST); goto unlock; } /* * If a larger entry exists, * it will be the first and only entry iterated. */ if (order == -1) order = xas_get_order(&xas); } if (old) { if (order > 0 && order > forder) { unsigned int split_order = max(forder, xas_try_split_min_order(order)); /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); while (order > forder) { xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, order); if (xas_error(&xas)) goto unlock; order = split_order; split_order = max(xas_try_split_min_order( split_order), forder); } xas_reset(&xas); } if (shadowp) *shadowp = old; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; mapping->nrpages += nr; /* hugetlb pages do not participate in page cache accounting */ if (!huge) { lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) goto error; trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; struct mem_cgroup *tmp; bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); if (kernel_file) tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); if (kernel_file) set_active_memcg(tmp); if (ret) return ret; __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) { mem_cgroup_uncharge(folio); __folio_clear_locked(folio); } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed folio. * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ WARN_ON_ONCE(folio_test_active(folio)); if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); if (kernel_file) mod_node_page_state(folio_pgdat(folio), NR_KERNEL_FILE_PAGES, folio_nr_pages(folio)); } return ret; } EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, struct mempolicy *policy) { int n; struct folio *folio; if (policy) return folio_alloc_mpol_noprof(gfp, order, policy, NO_INTERLEAVE_INDEX, numa_node_id()); if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } return folio_alloc_noprof(gfp, order); } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* * filemap_invalidate_lock_two - lock invalidate_lock for two mappings * * Lock exclusively invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to lock * @mapping2: the second mapping to lock */ void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1 > mapping2) swap(mapping1, mapping2); if (mapping1) down_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) down_write_nested(&mapping2->invalidate_lock, 1); } EXPORT_SYMBOL(filemap_invalidate_lock_two); /* * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings * * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to unlock * @mapping2: the second mapping to unlock */ void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1) up_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) up_write(&mapping2->invalidate_lock); } EXPORT_SYMBOL(filemap_invalidate_unlock_two); /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of * waitqueues where the bucket discipline is to maintain all * waiters on the same queue and wake all when any of the pages * become available, and for the woken contexts to check to be * sure the appropriate page became available, this saves space * at a cost of "thundering herd" phenomena during rare hash * collisions. */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; static wait_queue_head_t *folio_waitqueue(struct folio *folio) { return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } /* How many times do we accept lock stealing from under a waiter? */ static int sysctl_page_lock_unfairness = 5; static const struct ctl_table filemap_sysctl_table[] = { { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, .maxlen = sizeof(sysctl_page_lock_unfairness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, } }; void __init pagecache_init(void) { int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); register_sysctl_init("vm", filemap_sysctl_table); } /* * The page wait code treats the "wait->flags" somewhat unusually, because * we have multiple different kinds of waits, not just the usual "exclusive" * one. * * We have: * * (a) no special bits set: * * We're just waiting for the bit to be released, and when a waker * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, * and remove it from the wait queue. * * Simple and straightforward. * * (b) WQ_FLAG_EXCLUSIVE: * * The waiter is waiting to get the lock, and only one waiter should * be woken up to avoid any thundering herd behavior. We'll set the * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. * * This is the traditional exclusive wait. * * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: * * The waiter is waiting to get the bit, and additionally wants the * lock to be transferred to it for fair lock behavior. If the lock * cannot be taken, we stop walking the wait queue without waking * the waiter. * * This is the "fair lock handoff" case, and in addition to setting * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see * that it now has the lock. */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); if (!wake_page_match(wait_page, key)) return 0; /* * If it's a lock handoff wait, we get the bit for it, and * stop walking (and do not wake it up) if we can't. */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } } /* * We are holding the wait-queue lock, but the waiter that * is waiting for this will be checking the flags without * any locking. * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * * Note that this pairs with the "finish_wait()" in the * waiter, and has to be the absolute last thing we do. * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void folio_wake_bit(struct folio *folio, int bit_nr) { wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page * waiters, but the hashed waitqueue has waiters for other pages on it. * That's okay, it's a rare case. The next waker will clear it. * * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, * other), the flag may be cleared in the course of freeing the page; * but that is not required for correctness. */ if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } /* * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, * like folio_put_wait_locked() on PG_locked. */ }; /* * Attempt to check (or get) the folio flag, and mark us done * if successful. */ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; if (bit_nr == PG_locked && !folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: wait->flags = 0; if (behavior == EXCLUSIVE) { wait->flags = WQ_FLAG_EXCLUSIVE; if (--unfairness < 0) wait->flags |= WQ_FLAG_CUSTOM; } /* * Do one last check whether we can get the * page bit synchronously. * * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the * page queue), and add ourselves to the wait * queue if we need to sleep. * * This part needs to be done under the queue * lock to avoid races. */ spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * From now on, all the logic will be based on * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to * see whether the page bit testing has already * been done by the wake function. * * We can drop our reference to the folio. */ if (behavior == DROP) folio_put(folio); /* * Note that until the "finish_wait()", or until * we see the WQ_FLAG_WOKEN flag, we need to * be very careful with the 'wait->flags', because * we may race with a waker that sets them. */ for (;;) { unsigned int flags; set_current_state(state); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(state, current)) break; io_schedule(); continue; } /* If we were non-exclusive, we're done */ if (behavior != EXCLUSIVE) break; /* If the waker got the lock for us, we're done */ if (flags & WQ_FLAG_DONE) break; /* * Otherwise, if we're getting the lock, we need to * try to get it ourselves. * * And if that fails, we'll have to retry this all. */ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; break; } /* * If a signal happened, this 'finish_wait()' may remove the last * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } /* * NOTE! The wait->flags weren't stable until we've done the * 'finish_wait()', and we could have exited the loop above due * to a signal, and had a wakeup event happen after the signal * test but before the 'finish_wait()'. * * So only after the finish_wait() can we reliably determine * if we got woken up or not, so we can now figure out the final * return value based on that state without races. * * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive * waiter, but an exclusive one requires WQ_FLAG_DONE. */ if (behavior == EXCLUSIVE) return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } #ifdef CONFIG_MIGRATION /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. * * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = softleaf_to_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = PG_locked; wait->flags = 0; spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, PG_locked, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ spin_unlock(ptl); for (;;) { unsigned int flags; set_current_state(TASK_UNINTERRUPTIBLE); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) break; io_schedule(); continue; } break; } finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } } #endif void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit); int folio_wait_bit_killable(struct folio *folio, int bit_nr) { return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit_killable); /** * folio_put_wait_locked - Drop a reference and wait for it to be unlocked * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not * dereference @folio. * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** * folio_unlock - Unlock a locked folio. * @folio: The folio. * * Unlocks the folio and wakes up any thread sleeping on the page lock. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_unlock(struct folio *folio) { /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); /** * folio_end_read - End read on a folio. * @folio: The folio. * @success: True if all reads completed successfully. * * When all reads against a folio have completed, filesystems should * call this function to let the pagecache know that no more reads * are outstanding. This will unlock the folio and wake up any thread * sleeping on the lock. The folio will also be marked uptodate if all * reads succeeded. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_end_read(struct folio *folio, bool success) { unsigned long mask = 1 << PG_locked; /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; if (folio_xor_flags_has_waiters(folio, mask)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. * * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for * it. The folio reference held for PG_private_2 being set is released. * * This is, for example, used when a netfs folio is being written to a local * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. */ void folio_end_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); folio_wake_bit(folio, PG_private_2); folio_put(folio); } EXPORT_SYMBOL(folio_end_private_2); /** * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { while (folio_test_private_2(folio)) folio_wait_bit(folio, PG_private_2); } EXPORT_SYMBOL(folio_wait_private_2); /** * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is * received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; while (folio_test_private_2(folio)) { ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } EXPORT_SYMBOL(folio_wait_private_2_killable); static void filemap_end_dropbehind(struct folio *folio) { struct address_space *mapping = folio->mapping; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (!folio_test_clear_dropbehind(folio)) return; if (mapping) folio_unmap_invalidate(mapping, folio, 0); } /* * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. */ void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; /* * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, * but can happen if normal writeback just happens to find dirty folios * that were created as part of uncached writeback, and that writeback * would otherwise not need non-IRQ handling. Just skip the * invalidation in that case. */ if (in_task() && folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); } } EXPORT_SYMBOL_GPL(folio_end_dropbehind); /** * folio_end_writeback_no_dropbehind - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * This call is intended for filesystems that need to defer dropbehind. * * Context: May be called from process or interrupt context. */ void folio_end_writeback_no_dropbehind(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing * to shuffle a folio marked for immediate reclaim is too mild * a gain to justify taking an atomic operation penalty at the * end of every folio writeback. */ if (folio_test_reclaim(folio)) { folio_clear_reclaim(folio); folio_rotate_reclaimable(folio); } if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); } EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind); /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and * reused before the folio_wake_bit(). */ folio_get(folio); folio_end_writeback_no_dropbehind(folio); folio_end_dropbehind(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock */ void __folio_lock(struct folio *folio) { folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } EXPORT_SYMBOL(__folio_lock); int __folio_lock_killable(struct folio *folio) { return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); int ret; wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); folio_set_waiters(folio); ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's * safe to remove and return success, we know the callback * isn't going to trigger. */ if (!ret) __remove_wait_queue(q, &wait->wait); else ret = -EIOCBQUEUED; spin_unlock_irq(&q->lock); return ret; } /* * Return values: * 0 - folio is locked. * non-zero - folio is not locked. * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_lock/per-VMA lock is not * released even though returning VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; ret = __folio_lock_killable(folio); if (ret) { release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { __folio_lock(folio); } return 0; } /** * page_cache_next_miss() - Find the next gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the * gap with the lowest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 5, then subsequently a gap is * created at index 10, page_cache_next_miss covering both indices may * return 10 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); unsigned long nr = max_scan; while (nr--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; if (xas.xa_index == 0) return 0; } return index + max_scan; } EXPORT_SYMBOL(page_cache_next_miss); /** * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [max(index - max_scan + 1, 0), index] for the * gap with the highest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 10, then subsequently a gap is * created at index 5, page_cache_prev_miss() covering both indices may * return 5 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) break; if (xas.xa_index == ULONG_MAX) break; } return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); /* * Lockless page cache protocol: * On the lookup side: * 1. Load the folio from i_pages * 2. Increment the refcount if it's not zero * 3. If the folio is not found by xas_reload(), put the refcount and retry * * On the removal side: * A. Freeze the page (by zeroing the refcount if nobody else has a reference) * B. Remove the page from i_pages * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by * folio_put(). */ /* * filemap_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * * Looks up the page cache entry at @mapping & @index. If it is a folio, * it is returned with an increased refcount. If it is a shadow entry * of a previously evicted folio, or a swap entry from shmem/tmpfs, * it is returned without further action. * * Return: The folio, swap or shadow entry, %NULL if nothing is found. */ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); folio = xas_load(&xas); if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ if (!folio || xa_is_value(folio)) goto out; if (!folio_try_get(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); goto repeat; } out: rcu_read_unlock(); return folio; } /** * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If this function returns a folio, it is returned with an increased refcount. * * Return: The found folio or an ERR_PTR() otherwise. */ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; repeat: folio = filemap_get_entry(mapping, index); if (xa_is_value(folio)) folio = NULL; if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); } /* Has the page been truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ if (folio_test_idle(folio)) folio_clear_idle(folio); } if (fgp_flags & FGP_STABLE) folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { unsigned int min_order = mapping_min_folio_order(mapping); unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; gfp |= GFP_NOWAIT; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; if (order > mapping_max_folio_order(mapping)) order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); do { gfp_t alloc_gfp = gfp; err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); if (fgp_flags & FGP_DONTCACHE) __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; } while (order-- > min_order); if (err == -EEXIST) goto repeat; if (err) { /* * When NOWAIT I/O fails to allocate folios this could * be due to a nonblocking memory allocation and not * because the system actually is out of memory. * Return -EAGAIN so that there caller retries in a * blocking fashion instead of propagating -ENOMEM * to the application. */ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) err = -EAGAIN; return ERR_PTR(err); } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. */ if (folio && (fgp_flags & FGP_FOR_MMAP)) folio_unlock(folio); } if (!folio) return ERR_PTR(-ENOENT); /* not an uncached lookup, clear uncached if set */ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { struct folio *folio; retry: if (mark == XA_PRESENT) folio = xas_find(xas, max); else folio = xas_find_marked(xas, max, mark); if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ if (!folio || xa_is_value(folio)) return folio; if (!folio_try_get(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { folio_put(folio); goto reset; } return folio; reset: xas_reset(xas); goto retry; } /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in * the mapping. The entries are placed in @fbatch. find_get_entries() * takes a reference on any actual folios it returns. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries or large folios. * * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; } if (folio_batch_count(fbatch)) { unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); else nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); *start = round_down(indices[idx] + nr, nr); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * find_lock_entries - Find a batch of pagecache entries. * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. * Swap, shadow and DAX entries are included. Folios are returned * locked and with an incremented refcount. Folios which are locked * by somebody else or under writeback are skipped. Folios which are * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries, large folios, folios which could not be * locked or folios under writeback. * * Return: The number of entries which were found. */ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { unsigned long base; unsigned long nr; if (!xa_is_value(folio)) { nr = folio_nr_pages(folio); base = folio->index; /* Omit large folio which begins before the start */ if (base < *start) goto put; /* Omit large folio which extends beyond the end */ if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; if (folio->mapping != mapping || folio_test_writeback(folio)) goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); } else { nr = 1 << xas_get_order(&xas); base = xas.xa_index & ~(nr - 1); /* Omit order>0 value which begins before the start */ if (base < *start) continue; /* Omit order>0 value which extends beyond the end */ if (base + nr - 1 > end) break; } /* Update start now so that last update is correct on return */ *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; continue; unlock: folio_unlock(folio); put: folio_put(folio); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * filemap_get_folios - Get a batch of folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill. * * Search for and return a batch of folios in the mapping starting at * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_contig() works exactly like filemap_get_folios(), * except the returned folios are guaranteed to be contiguous. This may * not return all contiguous folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. */ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); unsigned long nr; struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio && xas.xa_index <= end; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) goto update_start; /* If we landed in the middle of a THP, continue at its end. */ if (xa_is_sibling(folio)) goto update_start; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } update_start: nr = folio_batch_count(fbatch); if (nr) { folio = fbatch->folios[nr - 1]; *start = folio_next_index(folio); } out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_contig); /** * filemap_get_folios_tag - Get a batch of folios matching @tag * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @tag: The tag index * @fbatch: The batch to fill * * The first folio may start before @start; if it does, it will contain * @start. The final folio may extend beyond @end; if it does, it will * contain @end. The folios have ascending indices. There may be gaps * between the folios if there are indices which have no folio in the * page cache. If folios are added to or removed from the page cache * while this is running, they may or may not be found by this call. * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. */ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, tag)) != NULL) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ if (xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a page at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_tag); /** * filemap_get_folios_dirty - Get a batch of dirty folios * @mapping: The address_space to search * @start: The starting folio index * @end: The final folio index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except * the returned folios are presumed to be dirty or undergoing writeback. Dirty * state is presumed because we don't block on folio lock nor want to miss * folios. Callers that need to can recheck state upon locking the folio. * * This may not return all dirty folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. */ unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { if (xa_is_value(folio)) continue; if (folio_trylock(folio)) { bool clean = !folio_test_dirty(folio) && !folio_test_writeback(folio); folio_unlock(folio); if (clean) { folio_put(folio); continue; } } if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no folio beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a folio at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: * * ---R__________________________________________B__________ * ^ reading here ^ bad block(assume 4k) * * read(R) => miss => readahead(R...B) => media error => frustrating retries * => failing the whole request => read(R) => read(R+1) => * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... * * It is going insane. Fix it by quickly scaling down the readahead size. */ static void shrink_readahead_size_eio(struct file_ra_state *ra) { ra->ra_pages /= 4; } /* * filemap_get_read_batch - Get a batch of folios for read * * Get a batch of folios which represent a contiguous range of bytes in * the file. No exceptional entries will be returned. If @index is in * the middle of a folio, the entire folio will be returned. The last * folio in the batch may have the readahead flag set or the uptodate flag * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; if (xas.xa_index > max || xa_is_value(folio)) break; if (xa_is_sibling(folio)) break; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) break; if (!folio_test_uptodate(folio)) break; if (folio_test_readahead(folio)) break; xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); unsigned long pflags; int error; /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); error = filler(file, folio); if (unlikely(workingset)) psi_memstall_leave(&pflags); if (error) return error; error = folio_wait_locked_killable(folio); if (error) return error; if (folio_test_uptodate(folio)) return 0; if (file) shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, loff_t pos, size_t count, struct folio *folio, bool need_uptodate) { if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; } else { pos -= folio_pos(folio); } if (pos == 0 && count >= folio_size(folio)) return false; return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, size_t count, struct folio *folio, bool need_uptodate) { int error; if (iocb->ki_flags & IOCB_NOWAIT) { if (!filemap_invalidate_trylock_shared(mapping)) return -EAGAIN; } else { filemap_invalidate_lock_shared(mapping); } if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); /* * This is where we usually end up waiting for a * previously submitted readahead to finish. */ folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; if (!folio->mapping) goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, need_uptodate)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); goto unlock_mapping; unlock: folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); if (error == AOP_TRUNCATED_PAGE) folio_put(folio); return error; } static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock * here assures we cannot instantiate and bring uptodate new * pagecache folios after evicting page cache during truncate * and before actually freeing blocks. Note that we could * release invalidate_lock after inserting the folio into * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, struct address_space *mapping, struct folio *folio, pgoff_t last_index) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, size_t count, struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; unsigned int flags; int err = 0; /* "last_index" is the index of the folio beyond the end of the read */ last_index = round_up(iocb->ki_pos + count, mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT; retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } folio = fbatch->folios[folio_batch_count(fbatch) - 1]; if (folio_test_readahead(folio)) { err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } if (!folio_test_uptodate(folio)) { if (folio_batch_count(fbatch) > 1) { err = -EAGAIN; goto err; } err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) goto err; } trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) folio_put(folio); if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) { unsigned int shift = folio_shift(folio); return (pos1 >> shift == pos2 >> shift); } static void filemap_end_dropbehind_read(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); } } /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. * @iter: Destination for the data. * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. If an error happens before any bytes are copied, returns * a negative error number. */ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos < 0)) return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { cond_resched(); /* * If we've already successfully copied some data, then we * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(mapping); /* * When a read accesses the same folio several times, only * mark it as accessed the first time. */ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); size_t copied; if (end_offset < folio_pos(folio)) break; if (i > 0) folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; break; } } put_folios: for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; filemap_end_dropbehind_read(folio); folio_put(folio); } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos; loff_t end = pos + count - 1; if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, pos, end)) return -EAGAIN; return 0; } return filemap_write_and_wait_range(mapping, pos, end); } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait) { int ret; if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; } else { ret = filemap_write_and_wait_range(mapping, pos, end); if (ret) return ret; } /* * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; return filemap_invalidate_pages(mapping, iocb->ki_pos, iocb->ki_pos + count - 1, iocb->ki_flags & IOCB_NOWAIT); } EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block * @iter: destination for the data read * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. * * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall * be returned when no data can be read without waiting for I/O requests * to complete; it doesn't prevent readahead. * * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O * requests shall be made for the read or for readahead. When no data * can be read, -EAGAIN shall be returned. When readahead would be * triggered, a partial, possibly empty read shall be returned. * * Return: * * number of bytes copied, even for partial reads * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { size_t count = iov_iter_count(iter); ssize_t retval = 0; if (!count) return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; retval = kiocb_write_and_wait(iocb, count); if (retval < 0) return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); if (retval >= 0) { iocb->ki_pos += retval; count -= retval; } if (retval != -EIOCBQUEUED) iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter * compressed extents, so if there was an error, or if * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ if (retval < 0 || !count || IS_DAX(inode)) return retval; if (iocb->ki_pos >= i_size_read(inode)) return retval; } return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); /* * Splice subpages from a folio into a pipe. */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size) { struct page *page; size_t spliced = 0, offset = offset_in_folio(folio, fpos); page = folio_page(folio, offset / PAGE_SIZE); size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); *buf = (struct pipe_buffer) { .ops = &page_cache_pipe_buf_ops, .page = page, .offset = offset, .len = part, }; folio_get(folio); pipe->head++; page++; spliced += part; offset = 0; } return spliced; } /** * filemap_splice_read - Splice data from a file's pagecache into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function gets folios from a file's pagecache and splices them into the * pipe. Readahead will be called as necessary to fill more folios. This may * be used for blockdevs also. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct folio_batch fbatch; struct kiocb iocb; size_t total_spliced = 0, used, npages; loff_t isize, end_offset; bool writably_mapped; int i, error = 0; if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) return 0; init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); folio_batch_init(&fbatch); do { cond_resched(); if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; error = filemap_get_pages(&iocb, len, &fbatch, true); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(in->f_mapping); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t n; if (folio_pos(folio) >= end_offset) goto out; folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); n = min_t(loff_t, len, isize - *ppos); n = splice_folio_into_pipe(pipe, folio, *ppos, n); if (!n) goto out; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) goto out; } folio_batch_release(&fbatch); } while (len); out: folio_batch_release(&fbatch); file_accessed(in); return total_spliced ? total_spliced : error; } EXPORT_SYMBOL(filemap_splice_read); static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); folio_lock(folio); if (unlikely(folio->mapping != mapping)) goto unlock; offset = offset_in_folio(folio, start) & ~(bsz - 1); do { if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: folio_unlock(folio); rcu_read_lock(); return start; } static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } /** * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. * @mapping: Address space to search. * @start: First byte to consider. * @end: Limit of search (exclusive). * @whence: Either SEEK_HOLE or SEEK_DATA. * * If the page cache knows which blocks contain holes and which blocks * contain data, your filesystem can use this function to implement * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. */ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; size_t seek_size; if (start < pos) { if (!seek_data) goto unlock; start = pos; } seek_size = seek_folio_size(&xas, folio); pos = round_up((u64)pos + 1, seek_size); start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; if (start >= end) break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(folio)) folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); if (folio && !xa_is_value(folio)) folio_put(folio); if (start > end) return end; return start; } #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * * This works similar to lock_folio_or_retry in that it can drop the * mmap_lock. It differs in that it actually returns the folio locked * if it returns 1 and 0 if it couldn't lock the folio. If we did have * to drop the mmap_lock then fpin will point to the pinned file and * needs to be fput()'ed at a later point. */ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { if (folio_trylock(folio)) return 1; /* * NOTE! This will make us return with VM_FAULT_RETRY, but with * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the * fault lock, but all fault_handlers only check * for fatal signals if we return VM_FAULT_RETRY, * so we need to drop the fault lock here and * return 0 if we don't have a fpin. */ if (*fpin == NULL) release_fault_lock(vmf); return 0; } } else __folio_lock(folio); return 1; } /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have * to drop the mmap sem we return the file that was pinned in order for us to do * that. If we didn't pin a file then we return NULL. The file that is * returned needs to be fput()'ed when we're done with it. */ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; vm_flags_t vm_flags = vmf->vma->vm_flags; bool force_thp_readahead = false; unsigned short mmap_miss; /* Use the readahead code, even if readahead is disabled */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) force_thp_readahead = true; if (!force_thp_readahead) { /* * If we don't want any read-ahead, don't bother. * VM_EXEC case below is already intended for random access. */ if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; if (vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } } if (!(vm_flags & VM_SEQ_READ)) { /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) WRITE_ONCE(ra->mmap_miss, ++mmap_miss); /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ if (mmap_miss > MMAP_LOTSAMISS) return fpin; } if (force_thp_readahead) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; /* * Fetch two PMD folios, so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; ra->order = HPAGE_PMD_ORDER; page_cache_ra_order(&ractl, ra); return fpin; } if (vm_flags & VM_EXEC) { /* * Allow arch to request a preferred minimum folio order for * executable memory. This can often be beneficial to * performance if (e.g.) arm64 can contpte-map the folio. * Executable memory rarely benefits from readahead, due to its * random access nature, so set async_size to 0. * * Limit to the boundaries of the VMA to avoid reading in any * pad that might exist between sections, which would be a waste * of memory. */ struct vm_area_struct *vma = vmf->vma; unsigned long start = vma->vm_pgoff; unsigned long end = start + vma_pages(vma); unsigned long ra_end; ra->order = exec_folio_order(); ra->start = round_down(vmf->pgoff, 1UL << ra->order); ra->start = max(ra->start, start); ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); ra_end = min(ra_end, end); ra->size = ra_end - ra->start; ra->async_size = 0; } else { /* * mmap read-around */ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra->order = 0; } fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index = ra->start; page_cache_ra_order(&ractl, ra); return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further. We return the file that * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned short mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; /* * If the folio is locked, we're likely racing against another fault. * Don't touch the mmap_miss counter to avoid decreasing it multiple * times for a single folio and break the balance with mmap_miss * increase in do_sync_mmap_readahead(). */ if (likely(!folio_test_locked(folio))) { mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); } if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; pte_t *ptep; /* * We might have COW'ed a pagecache folio and might now have an mlocked * anon folio mapped. The original pagecache folio is not mlocked and * might have been evicted. During a read+clear/modify/write update of * the PTE, such as done in do_numa_page()/change_pte_range(), we * temporarily clear the PTE under PT lock and might detect it here as * "none" when not holding the PT lock. * * Not rechecking the PTE under PT lock could result in an unexpected * major fault in an mlock'ed region. Recheck only for this special * scenario while holding the PT lock, to not degrade non-mlocked * scenarios. Recheck the PTE without PT lock firstly, thereby reducing * the number of times we hold PT lock. */ if (!(vma->vm_flags & VM_LOCKED)) return 0; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { ret = VM_FAULT_NOPAGE; } else { spin_lock(vmf->ptl); if (unlikely(!pte_none(ptep_get(ptep)))) ret = VM_FAULT_NOPAGE; spin_unlock(vmf->ptl); } pte_unmap(ptep); return ret; } /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. * * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t max_idx, index = vmf->pgoff; struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; trace_mm_filemap_fault(mapping, index); /* * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) fpin = do_async_mmap_readahead(vmf, folio); if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } } else { ret = filemap_fault_recheck_pte_none(vmf); if (unlikely(ret)) return ret; /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: /* * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto retry_find; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked folio in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error, * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. Let's drop * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } /* * OK, the folio is really not uptodate. This can be because the * VMA has the VM_RAND_READ flag set, or because an error * arose. Let's read it in directly. */ goto page_not_uptodate; } /* * We've made it this far and we had to drop our mmap_lock, now is the * time to return to the upper layer and have it re-find the vma and * redo the fault. */ if (fpin) { folio_unlock(folio); goto out_retry; } if (mapping_locked) filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it. * We must recheck i_size under page lock. */ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) { folio_unlock(folio); folio_put(folio); return VM_FAULT_SIGBUS; } vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (fpin) goto out_retry; folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; out_retry: /* * We dropped the mmap_lock, we need to return to the fault handler to * re-find the vma and come back and find our hopefully still populated * page. */ if (!IS_ERR(folio)) folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { folio_unlock(folio); folio_put(folio); return true; } if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); return true; } } if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; } static struct folio *next_uptodate_folio(struct xa_state *xas, struct address_space *mapping, pgoff_t end_pgoff) { struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { if (!folio) return NULL; if (xas_retry(xas, folio)) continue; if (xa_is_value(folio)) continue; if (!folio_try_get(folio)) continue; if (folio_test_locked(folio)) goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; if (!folio_trylock(folio)) goto skip; if (folio->mapping != mapping) goto unlock; if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; return folio; unlock: folio_unlock(folio); skip: folio_put(folio); } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } /* * Map page range [start_page, start_page + nr_pages) of folio. * start_page is gotten from start by folio_page(folio, start) */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned short *mmap_miss, pgoff_t file_end) { struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; unsigned long addr0; /* * Map the large folio fully where possible: * * - The folio is fully within size of the file or belong * to shmem/tmpfs; * - The folio doesn't cross VMA boundary; * - The folio doesn't cross page table boundary; */ addr0 = addr - start * PAGE_SIZE; if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && folio_within_vma(folio, vmf->vma) && (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { vmf->pte -= start; page -= start; addr = addr0; nr_pages = folio_nr_pages(folio); } do { if (PageHWPoison(page + count)) goto skip; /* * If there are too many folios that are recently evicted * in a file, they will probably continue to be evicted. * In such situation, read-ahead is only a waste of IO. * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; continue; skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count - ref_from_caller); ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } count++; page += count; vmf->pte += count; addr += count * PAGE_SIZE; count = 0; } while (--nr_pages > 0); if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count - ref_from_caller); ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; if (ref_from_caller) /* Locked folios cannot get truncated. */ folio_ref_dec(folio); return ret; } static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; if (PageHWPoison(page)) goto out; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) goto out; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; return ret; out: /* Locked folios cannot get truncated. */ folio_ref_dec(folio); return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; unsigned short mmap_miss = 0, mmap_miss_saved; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; end_pgoff = min(end_pgoff, file_end); /* * Do not allow to map with PMD across i_size to preserve * SIGBUS semantics. * * Make an exception for shmem/tmpfs that for long time * intentionally mapped with PMDs across i_size. */ if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) { folio_unlock(folio); folio_put(folio); goto out; } folio_type = mm_counter_file(folio); do { unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, nr_pages, &rss, &mmap_miss, file_end); folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); if (mmap_miss >= mmap_miss_saved) WRITE_ONCE(file->f_ra.mmap_miss, 0); else WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will * see the dirty folio and writeprotect it again. */ folio_mark_dirty(folio); folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; } const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, }; /* This is used for a general mmap of a disk file */ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; return 0; } int generic_file_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); desc->vm_ops = &generic_file_vm_ops; return 0; } /* * This is for filesystems which do not implement ->writepage. */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { if (is_shared_maywrite(desc->vm_flags)) return -EINVAL; return generic_file_mmap_prepare(desc); } #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_mmap_prepare(struct vm_area_desc *desc) { return -ENOSYS; } int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { return -ENOSYS; } #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_mmap_prepare); EXPORT_SYMBOL(generic_file_readonly_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap_prepare); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) { struct folio *folio; int err; if (!filler) filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } goto filler; } if (folio_test_uptodate(folio)) goto out; if (!folio_trylock(folio)) { folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); goto repeat; } /* Folio was truncated from mapping */ if (!folio->mapping) { folio_unlock(folio); folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ if (folio_test_uptodate(folio)) { folio_unlock(folio); goto out; } filler: err = filemap_read_folio(file, filler, folio); if (err) { folio_put(folio); if (err == AOP_TRUNCATED_PAGE) goto repeat; return ERR_PTR(err); } out: folio_mark_accessed(folio); return folio; } /** * read_cache_folio - Read into page cache, fill it if needed. * @mapping: The address_space to read from. * @index: The index to read. * @filler: Function to perform the read, or NULL to use aops->read_folio(). * @file: Passed to filler function, may be NULL if not required. * * Read one page into the page cache. If it succeeds, the folio returned * will contain @index, but it may not be the first page of the folio. * * If the filler function returns an error, it will be returned to the * caller. * * Context: May sleep. Expects mapping->invalidate_lock to be held. * Return: An uptodate folio on success, ERR_PTR() on failure. */ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file) { return do_read_cache_folio(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_folio); /** * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. * @mapping: The address_space for the folio. * @index: The index that the allocated folio will contain. * @gfp: The page allocator flags to use if allocating. * * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with * any new memory allocations done using the specified allocation flags. * * The most likely error from this function is EIO, but ENOMEM is * possible and so is EINTR. If ->read_folio returns another error, * that will be returned to the caller. * * The function expects mapping->invalidate_lock to be already held. * * Return: Uptodate folio on success, ERR_PTR() on failure. */ struct folio *mapping_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_folio(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(mapping_read_folio_gfp); static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { struct folio *folio; folio = do_read_cache_folio(mapping, index, filler, file, gfp); if (IS_ERR(folio)) return &folio->page; return folio_file_page(folio, index); } struct page *read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file) { return do_read_cache_page(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. * @mapping: the page's address_space * @index: the page index * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. * * The function expects mapping->invalidate_lock to be already held. * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; char *path; errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, current->comm); } } void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (mapping->nrpages && invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) dio_warn_stale_pagecache(iocb->ki_filp); } ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct address_space *mapping = iocb->ki_filp->f_mapping; size_t write_len = iov_iter_count(from); ssize_t written; /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; return written; } written = mapping->a_ops->direct_IO(iocb, from); /* * Finally, try again to invalidate clean pages which might have been * cached by non-direct readahead, or faulted in by get_user_pages() * if the source of the write was an mmap'ed region of the file * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... * * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * * Noticeable example is a blkdev_direct_IO(). * * Skip invalidation for async writes or if mapping has no pages. */ if (written > 0) { struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } iocb->ki_pos = pos; } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); return written; } EXPORT_SYMBOL(generic_file_direct_write); ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; size_t chunk = mapping_max_folio_size(mapping); long status = 0; ssize_t written = 0; do { struct folio *folio; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; bytes = iov_iter_count(i); retry: offset = pos & (chunk - 1); bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { status = -EINTR; break; } status = a_ops->write_begin(iocb, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); /* * Faults here on mmap()s can recurse into arbitrary * filesystem code. Lots of locks are held that can * deadlock. Use an atomic copy to avoid deadlocking * in page fault handling. */ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); status = a_ops->write_end(iocb, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) break; } cond_resched(); if (unlikely(status == 0)) { /* * A short copy made ->write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { bytes = copied; goto retry; } /* * 'folio' is now unlocked and faults on it can be * handled. Ensure forward progress by trying to * fault it in now. */ if (fault_in_iov_iter_readable(i, bytes) == bytes) { status = -EFAULT; break; } } else { pos += status; written += status; } } while (iov_iter_count(i)); if (!written) return status; iocb->ki_pos += written; return written; } EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; ret = file_remove_privs(file); if (ret) return ret; ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to * holes, for example. For DAX files, a buffered write will * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) return ret; return direct_write_fallback(iocb, from, ret, generic_perform_write(iocb, from)); } return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); /** * generic_file_write_iter - write data to a file * @iocb: IO state structure * @from: iov_iter with data to write * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_rwsem as needed. * Return: * * negative error code if no data has been written at all of * vfs_fsync_range() failed for a synchronous write * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); /** * filemap_release_folio() - Release fs-specific metadata on a folio. * @folio: The folio which the kernel is trying to free. * @gfp: Memory allocation flags (and I/O mode). * * The address_space is trying to release any data attached to a folio * (presumably at folio->private). * * This will also be called if the private_2 flag is set on a page, * indicating that the folio has other metadata associated with it. * * The @gfp argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block * (__GFP_RECLAIM & __GFP_FS). * * Return: %true if the release was successful, otherwise %false. */ bool filemap_release_folio(struct folio *folio, gfp_t gfp) { struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); if (!folio_needs_release(folio)) return true; if (folio_test_writeback(folio)) return false; if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); /** * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache * @inode: The inode to flush * @flush: Set to write back rather than simply invalidate. * @start: First byte to in range. * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start * onwards. * * Invalidate all the folios on an inode that contribute to the specified * range, possibly writing them back first. Whilst the operation is * undertaken, the invalidate lock is held to prevent new folios from being * installed. */ int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; pgoff_t first = start >> PAGE_SHIFT; pgoff_t last = end >> PAGE_SHIFT; pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; if (!mapping || !mapping->nrpages || end < start) goto out; /* Prevent new folios from being added to the inode. */ filemap_invalidate_lock(mapping); if (!mapping->nrpages) goto unlock; unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ if (flush) filemap_fdatawrite_range(mapping, start, end); /* Wait for writeback to complete on all folios and discard. */ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); out: return filemap_check_errors(mapping); } EXPORT_SYMBOL_GPL(filemap_invalidate_inode); #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping * @mapping: The mapping to compute the statistics for. * @first_index: The starting page cache index. * @last_index: The final page index (inclusive). * @cs: the cachestat struct to write the result to. * * This will query the page cache statistics of a mapping in the * page range of [first_index, last_index] (inclusive). The statistics * queried include: number of dirty pages, number of pages marked for * writeback, and the number of (recently) evicted pages. */ static void filemap_cachestat(struct address_space *mapping, pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) { XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; /* Flush stats (and potentially sleep) outside the RCU read section. */ mem_cgroup_flush_stats_ratelimited(NULL); rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; /* * Don't deref the folio. It is not pinned, and might * get freed (and reused) underneath us. * * We *could* pin it, but that would be expensive for * what should be a fast and lightweight syscall. * * Instead, derive all information of interest from * the rcu-protected xarray. */ if (xas_retry(&xas, folio)) continue; order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; /* Folios might straddle the range boundaries, only count covered pages */ if (folio_first_index < first_index) nr_pages -= first_index - folio_first_index; if (folio_last_index > last_index) nr_pages -= folio_last_index - last_index; if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ cs->nr_evicted += nr_pages; #ifdef CONFIG_SWAP /* implies CONFIG_MMU */ if (shmem_mapping(mapping)) { /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ if (!softleaf_is_swap(swp)) goto resched; /* * Getting a swap entry from the shmem * inode means we beat * shmem_unuse(). rcu_read_lock() * ensures swapoff waits for us before * freeing the swapper space. However, * we can race with swapping and * invalidation, so there might not be * a shadow in the swapcache (yet). */ shadow = swap_cache_get_shadow(swp); if (!shadow) goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset, false)) cs->nr_recently_evicted += nr_pages; goto resched; } /* page is in cache */ cs->nr_cache += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); } /* * See mincore: reveal pagecache information only for files * that the calling process has write access to, or could (if * tried) open for writing. */ static inline bool can_do_cachestat(struct file *f) { if (f->f_mode & FMODE_WRITE) return true; if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) return true; return file_permission(f, MAY_WRITE) == 0; } /* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the * bytes range specified by `off` and `len`: number of cached pages, * number of dirty pages, number of pages marked for writeback, * number of evicted pages, and number of recently evicted pages. * * An evicted page is a page that is previously in the page cache * but has been evicted since. A page is recently evicted if its last * eviction was recent enough that its reentry to the cache would * indicate that it is actively being used by the system, and that * there is memory pressure on the system. * * `off` and `len` must be non-negative integers. If `len` > 0, * the queried range is [`off`, `off` + `len`]. If `len` == 0, * we will query in the range from `off` to the end of the file. * * The `flags` argument is unused for now, but is included for future * extensibility. User should pass 0 (i.e no flag specified). * * Currently, hugetlbfs is not supported. * * Because the status of a page can change after cachestat() checks it * but before it returns to the application, the returned values may * contain stale information. * * return values: * zero - success * -EFAULT - cstat or cstat_range points to an illegal address * -EINVAL - invalid flags * -EBADF - invalid file descriptor * -EOPNOTSUPP - file descriptor is of a hugetlbfs file */ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, sizeof(struct cachestat_range))) return -EFAULT; /* hugetlbfs is not supported */ if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; if (!can_do_cachestat(fd_file(f))) return -EPERM; if (flags != 0) return -EINVAL; first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; return 0; } #endif /* CONFIG_CACHESTAT_SYSCALL */
66 20 58 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2021 Facebook */ #ifndef __MMAP_UNLOCK_WORK_H__ #define __MMAP_UNLOCK_WORK_H__ #include <linux/irq_work.h> /* irq_work to run mmap_read_unlock() in irq_work */ struct mmap_unlock_irq_work { struct irq_work irq_work; struct mm_struct *mm; }; DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); /* * We cannot do mmap_read_unlock() when the irq is disabled, because of * risk to deadlock with rq_lock. To look up vma when the irqs are * disabled, we need to run mmap_read_unlock() in irq_work. We use a * percpu variable to do the irq_work. If the irq_work is already used * by another lookup, we fall over. */ static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr) { struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = false; if (irqs_disabled()) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { work = this_cpu_ptr(&mmap_unlock_work); if (irq_work_is_busy(&work->irq_work)) { /* cannot queue more up_read, fallback */ irq_work_busy = true; } } else { /* * PREEMPT_RT does not allow to trylock mmap sem in * interrupt disabled context. Force the fallback code. */ irq_work_busy = true; } } *work_ptr = work; return irq_work_busy; } static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm) { if (!work) { mmap_read_unlock(mm); } else { work->mm = mm; /* The lock will be released once we're out of interrupt * context. Tell lockdep that we've released it now so * it doesn't complain that we forgot to release it. */ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } #endif /* __MMAP_UNLOCK_WORK_H__ */
2 119 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2018-2025 Intel Corporation */ #ifndef IEEE80211_I_H #define IEEE80211_I_H #include <linux/kernel.h> #include <linux/device.h> #include <linux/if_ether.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/workqueue.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/etherdevice.h> #include <linux/leds.h> #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/rbtree.h> #include <kunit/visibility.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> #include <net/fq.h> #include "key.h" #include "sta_info.h" #include "debug.h" #include "drop.h" extern const struct cfg80211_ops mac80211_config_ops; struct ieee80211_local; struct ieee80211_mesh_fast_tx; /* Maximum number of broadcast/multicast frames to buffer when some of the * associated stations are using power saving. */ #define AP_MAX_BC_BUFFER 128 /* Maximum number of frames buffered to all STAs, including multicast frames. * Note: increasing this limit increases the potential memory requirement. Each * frame can be up to about 2 kB long. */ #define TOTAL_MAX_TX_BUFFER 512 /* Required encryption head and tailroom */ #define IEEE80211_ENCRYPT_HEADROOM 8 #define IEEE80211_ENCRYPT_TAILROOM 18 /* power level hasn't been configured (or set to automatic) */ #define IEEE80211_UNSET_POWER_LEVEL INT_MIN /* * Some APs experience problems when working with U-APSD. Decreasing the * probability of that happening by using legacy mode for all ACs but VO isn't * enough. * * Cisco 4410N originally forced us to enable VO by default only because it * treated non-VO ACs as legacy. * * However some APs (notably Netgear R7000) silently reclassify packets to * different ACs. Since u-APSD ACs require trigger frames for frame retrieval * clients would never see some frames (e.g. ARP responses) or would fetch them * accidentally after a long time. * * It makes little sense to enable u-APSD queues by default because it needs * userspace applications to be aware of it to actually take advantage of the * possible additional powersavings. Implicitly depending on driver autotrigger * frame support doesn't make much sense. */ #define IEEE80211_DEFAULT_UAPSD_QUEUES 0 #define IEEE80211_DEFAULT_MAX_SP_LEN \ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) #define IEEE80211_MAX_NAN_INSTANCE_ID 255 /* * Current mac80211 implementation supports a maximum of 1600 AIDS * for S1G interfaces. With regards to an S1G TIM, this covers 25 blocks * as each block is 64 AIDs. */ #define IEEE80211_MAX_SUPPORTED_S1G_AID 1600 #define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25 enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, IEEE80211_STATUS_TYPE_SMPS = 1, IEEE80211_STATUS_TYPE_NEG_TTLM = 2, IEEE80211_STATUS_SUBDATA_MASK = 0x1ff0, }; static inline bool ieee80211_sta_keep_active(struct sta_info *sta, u8 ac) { /* Keep a station's queues on the active list for deficit accounting * purposes if it was active or queued during the last 100ms. */ return time_before_eq(jiffies, sta->airtime[ac].last_active + HZ / 10); } struct ieee80211_bss { u32 device_ts_beacon, device_ts_presp; bool wmm_used; bool uapsd_supported; #define IEEE80211_MAX_SUPP_RATES 32 u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; size_t supp_rates_len; struct ieee80211_rate *beacon_rate; u32 vht_cap_info; /* * During association, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the * association completes. these fields probably won't be up-to-date * otherwise, you probably don't want to use them. */ bool has_erp_value; u8 erp_value; /* Keep track of the corruption of the last beacon/probe response. */ u8 corrupt_data; /* Keep track of what bits of information we have valid info for. */ u8 valid_data; }; /** * enum ieee80211_bss_corrupt_data_flags - BSS data corruption flags * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted * * These are bss flags that are attached to a bss in the * @corrupt_data field of &struct ieee80211_bss. */ enum ieee80211_bss_corrupt_data_flags { IEEE80211_BSS_CORRUPT_BEACON = BIT(0), IEEE80211_BSS_CORRUPT_PROBE_RESP = BIT(1) }; /** * enum ieee80211_bss_valid_data_flags - BSS valid data flags * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE * * These are bss flags that are attached to a bss in the * @valid_data field of &struct ieee80211_bss. They show which parts * of the data structure were received as a result of an un-corrupted * beacon/probe response. */ enum ieee80211_bss_valid_data_flags { IEEE80211_BSS_VALID_WMM = BIT(1), IEEE80211_BSS_VALID_RATES = BIT(2), IEEE80211_BSS_VALID_ERP = BIT(3) }; typedef unsigned __bitwise ieee80211_tx_result; #define TX_CONTINUE ((__force ieee80211_tx_result) 0u) #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED ((__force ieee80211_tx_result) 2u) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) struct ieee80211_tx_data { struct sk_buff *skb; struct sk_buff_head skbs; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_key *key; struct ieee80211_tx_rate rate; unsigned int flags; }; /** * enum ieee80211_packet_rx_flags - packet RX flags * @IEEE80211_RX_AMSDU: a-MSDU packet * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering * * These are per-frame flags that are attached to a frame in the * @rx_flags field of &struct ieee80211_rx_status. */ enum ieee80211_packet_rx_flags { IEEE80211_RX_AMSDU = BIT(3), IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), IEEE80211_RX_DEFERRED_RELEASE = BIT(5), }; /** * enum ieee80211_rx_flags - RX data flags * * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). * * These flags are used across handling multiple interfaces * for a single frame. */ enum ieee80211_rx_flags { IEEE80211_RX_BEACON_REPORTED = BIT(0), }; struct ieee80211_rx_data { struct list_head *list; struct sk_buff *skb; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct ieee80211_link_data *link; struct sta_info *sta; struct link_sta_info *link_sta; struct ieee80211_key *key; unsigned int flags; /* * Index into sequence numbers array, 0..16 * since the last (16) is used for non-QoS, * will be 16 on non-QoS frames. */ int seqno_idx; /* * Index into the security IV/PN arrays, 0..16 * since the last (16) is used for CCMP-encrypted * management frames, will be set to 16 on mgmt * frames and 0 on non-QoS frames. */ int security_idx; int link_id; union { struct { u32 iv32; u16 iv16; } tkip; struct { u8 pn[IEEE80211_CCMP_PN_LEN]; } ccm_gcm; }; }; struct ieee80211_csa_settings { const u16 *counter_offsets_beacon; const u16 *counter_offsets_presp; int n_counter_offsets_beacon; int n_counter_offsets_presp; u8 count; }; struct ieee80211_color_change_settings { u16 counter_offset_beacon; u16 counter_offset_presp; u8 count; }; struct beacon_data { u8 *head, *tail; int head_len, tail_len; struct ieee80211_meshconf_ie *meshconf; u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u8 cntdwn_current_counter; struct cfg80211_mbssid_elems *mbssid_ies; struct cfg80211_rnr_elems *rnr_ies; struct rcu_head rcu_head; }; struct probe_resp { struct rcu_head rcu_head; int len; u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u8 data[]; }; struct fils_discovery_data { struct rcu_head rcu_head; int len; u8 data[]; }; struct unsol_bcast_probe_resp_data { struct rcu_head rcu_head; int len; u8 data[]; }; struct s1g_short_beacon_data { struct rcu_head rcu_head; u8 *short_head; u8 *short_tail; int short_head_len; int short_tail_len; }; struct ps_data { /* yes, this looks ugly, but guarantees that we can later use * bitmap_empty :) * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */ u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)] __aligned(__alignof__(unsigned long)); struct sk_buff_head bc_buf; atomic_t num_sta_ps; /* number of stations in PS mode */ int dtim_count; bool dtim_bc_mc; int sb_count; /* num short beacons til next long beacon */ }; struct ieee80211_if_ap { struct list_head vlans; /* write-protected with RTNL and local->mtx */ struct ps_data ps; atomic_t num_mcast_sta; /* number of stations receiving multicast */ bool multicast_to_unicast; bool active; }; struct ieee80211_if_vlan { struct list_head list; /* write-protected with RTNL and local->mtx */ /* used for all tx if the VLAN is configured to 4-addr mode */ struct sta_info __rcu *sta; atomic_t num_mcast_sta; /* number of stations receiving multicast */ }; struct mesh_stats { __u32 fwded_mcast; /* Mesh forwarded multicast frames */ __u32 fwded_unicast; /* Mesh forwarded unicast frames */ __u32 fwded_frames; /* Mesh total forwarded frames */ __u32 dropped_frames_ttl; /* Not transmitted since mesh_ttl == 0*/ __u32 dropped_frames_no_route; /* Not transmitted, no route found */ }; #define PREQ_Q_F_START 0x1 #define PREQ_Q_F_REFRESH 0x2 struct mesh_preq_queue { struct list_head list; u8 dst[ETH_ALEN]; u8 flags; }; struct ieee80211_roc_work { struct list_head list; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *chan; bool started, abort, hw_begun, notified; bool on_channel; unsigned long start_time; u32 duration, req_duration; struct sk_buff *frame; u64 cookie, mgmt_tx_cookie; enum ieee80211_roc_type type; }; /* flags used in struct ieee80211_if_managed.flags */ enum ieee80211_sta_flags { IEEE80211_STA_CONNECTION_POLL = BIT(1), IEEE80211_STA_CONTROL_PORT = BIT(2), IEEE80211_STA_MFP_ENABLED = BIT(6), IEEE80211_STA_UAPSD_ENABLED = BIT(7), IEEE80211_STA_NULLFUNC_ACKED = BIT(8), IEEE80211_STA_ENABLE_RRM = BIT(15), }; enum ieee80211_conn_mode { IEEE80211_CONN_MODE_S1G, IEEE80211_CONN_MODE_LEGACY, IEEE80211_CONN_MODE_HT, IEEE80211_CONN_MODE_VHT, IEEE80211_CONN_MODE_HE, IEEE80211_CONN_MODE_EHT, }; #define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_EHT enum ieee80211_conn_bw_limit { IEEE80211_CONN_BW_LIMIT_20, IEEE80211_CONN_BW_LIMIT_40, IEEE80211_CONN_BW_LIMIT_80, IEEE80211_CONN_BW_LIMIT_160, /* also 80+80 */ IEEE80211_CONN_BW_LIMIT_320, }; struct ieee80211_conn_settings { enum ieee80211_conn_mode mode; enum ieee80211_conn_bw_limit bw_limit; }; extern const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited; struct ieee80211_mgd_auth_data { struct cfg80211_bss *bss; unsigned long timeout; int tries; u16 algorithm, expected_transaction; unsigned long userspace_selectors[BITS_TO_LONGS(128)]; u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; bool done, waiting; bool peer_confirmed; bool timeout_started; int link_id; u8 ap_addr[ETH_ALEN] __aligned(2); u16 sae_trans, sae_status; size_t data_len; u8 data[]; }; struct ieee80211_mgd_assoc_data { struct { struct cfg80211_bss *bss; u8 addr[ETH_ALEN] __aligned(2); u8 ap_ht_param; struct ieee80211_vht_cap ap_vht_cap; size_t elems_len; u8 *elems; /* pointing to inside ie[] below */ struct ieee80211_conn_settings conn; u16 status; bool disabled; } link[IEEE80211_MLD_MAX_NUM_LINKS]; u8 ap_addr[ETH_ALEN] __aligned(2); /* this is for a workaround, so we use it only for non-MLO */ const u8 *supp_rates; u8 supp_rates_len; unsigned long timeout; int tries; u8 prev_ap_addr[ETH_ALEN]; u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len; bool wmm, uapsd; bool need_beacon; bool synced; bool timeout_started; bool comeback; /* whether the AP has requested association comeback */ bool s1g; bool spp_amsdu; s8 assoc_link_id; __le16 ext_mld_capa_ops; u8 fils_nonces[2 * FILS_NONCE_LEN]; u8 fils_kek[FILS_MAX_KEK_LEN]; size_t fils_kek_len; size_t ie_len; u8 *ie_pos; /* used to fill ie[] with link[].elems */ u8 ie[]; }; struct ieee80211_sta_tx_tspec { /* timestamp of the first packet in the time slice */ unsigned long time_slice_start; u32 admitted_time; /* in usecs, unlike over the air */ u8 tsid; s8 up; /* signed to be able to invalidate with -1 during teardown */ /* consumed TX time in microseconds in the time slice */ u32 consumed_tx_time; enum { TX_TSPEC_ACTION_NONE = 0, TX_TSPEC_ACTION_DOWNGRADE, TX_TSPEC_ACTION_STOP_DOWNGRADE, } action; bool downgraded; }; /* Advertised TID-to-link mapping info */ struct ieee80211_adv_ttlm_info { /* time in TUs at which the new mapping is established, or 0 if there is * no planned advertised TID-to-link mapping */ u16 switch_time; u32 duration; /* duration of the planned T2L map in TUs */ u16 map; /* map of usable links for all TIDs */ bool active; /* whether the advertised mapping is active or not */ }; DECLARE_EWMA(beacon_signal, 4, 4) struct ieee80211_if_managed { struct timer_list timer; struct timer_list conn_mon_timer; struct timer_list bcn_mon_timer; struct wiphy_work monitor_work; struct wiphy_work beacon_connection_loss_work; struct wiphy_work csa_connection_drop_work; unsigned long beacon_timeout; unsigned long probe_timeout; int probe_send_count; bool nullfunc_failed; u8 connection_loss:1, driver_disconnect:1, reconnect:1, associated:1; struct ieee80211_mgd_auth_data *auth_data; struct ieee80211_mgd_assoc_data *assoc_data; unsigned long userspace_selectors[BITS_TO_LONGS(128)]; bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ unsigned int flags; u16 mcast_seq_last; bool status_acked; bool status_received; __le16 status_fc; enum { IEEE80211_MFP_DISABLED, IEEE80211_MFP_OPTIONAL, IEEE80211_MFP_REQUIRED } mfp; /* management frame protection */ /* * Bitmask of enabled u-apsd queues, * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association * to take effect. */ unsigned int uapsd_queues; /* * Maximum number of buffered frames AP can deliver during a * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar. * Needs a new association to take effect. */ unsigned int uapsd_max_sp_len; u8 use_4addr; /* * State variables for keeping track of RSSI of the AP currently * connected to and informing driver when RSSI has gone * below/above a certain threshold. */ int rssi_min_thold, rssi_max_thold; struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */ struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */ struct ieee80211_s1g_cap s1g_capa; /* configured S1G overrides */ struct ieee80211_s1g_cap s1g_capa_mask; /* valid s1g_capa bits */ /* TDLS support */ u8 tdls_peer[ETH_ALEN] __aligned(2); struct wiphy_delayed_work tdls_peer_del_work; struct sk_buff *orig_teardown_skb; /* The original teardown skb */ struct sk_buff *teardown_skb; /* A copy to send through the AP */ spinlock_t teardown_lock; /* To lock changing teardown_skb */ bool tdls_wider_bw_prohibited; /* WMM-AC TSPEC support */ struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS]; /* Use a separate work struct so that we can do something here * while the sdata->work is flushing the queues, for example. * otherwise, in scenarios where we hardly get any traffic out * on the BE queue, but there's a lot of VO traffic, we might * get stuck in a downgraded situation and flush takes forever. */ struct wiphy_delayed_work tx_tspec_wk; /* Information elements from the last transmitted (Re)Association * Request frame. */ u8 *assoc_req_ies; size_t assoc_req_ies_len; struct wiphy_hrtimer_work ml_reconf_work; u16 removed_links; /* TID-to-link mapping support */ struct wiphy_hrtimer_work ttlm_work; struct ieee80211_adv_ttlm_info ttlm_info; struct wiphy_work teardown_ttlm_work; /* dialog token enumerator for neg TTLM request */ u8 dialog_token_alloc; struct wiphy_delayed_work neg_ttlm_timeout_work; /* Locally initiated multi-link reconfiguration */ struct { struct ieee80211_mgd_assoc_data *add_links_data; struct wiphy_delayed_work wk; u16 removed_links; u16 added_links; u8 dialog_token; } reconf; /* Support for epcs */ struct { bool enabled; u8 dialog_token; } epcs; }; struct ieee80211_if_ibss { struct timer_list timer; struct wiphy_work csa_connection_drop_work; unsigned long last_scan_completed; u32 basic_rates; bool fixed_bssid; bool fixed_channel; bool privacy; bool control_port; bool userspace_handles_dfs; u8 bssid[ETH_ALEN] __aligned(2); u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len, ie_len; u8 *ie; struct cfg80211_chan_def chandef; unsigned long ibss_join_req; /* probe response/beacon for IBSS */ struct beacon_data __rcu *presp; struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ spinlock_t incomplete_lock; struct list_head incomplete_stations; enum { IEEE80211_IBSS_MLME_SEARCH, IEEE80211_IBSS_MLME_JOINED, } state; }; /** * struct ieee80211_if_ocb - OCB mode state * * @housekeeping_timer: timer for periodic invocation of a housekeeping task * @wrkq_flags: OCB deferred task action * @incomplete_lock: delayed STA insertion lock * @incomplete_stations: list of STAs waiting for delayed insertion * @joined: indication if the interface is connected to an OCB network */ struct ieee80211_if_ocb { struct timer_list housekeeping_timer; unsigned long wrkq_flags; spinlock_t incomplete_lock; struct list_head incomplete_stations; bool joined; }; /** * struct ieee80211_mesh_sync_ops - Extensible synchronization framework interface * * these declarations define the interface, which enables * vendor-specific mesh synchronization * * @rx_bcn_presp: beacon/probe response was received * @adjust_tsf: TSF adjustment method */ struct ieee80211_mesh_sync_ops { void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata, u16 stype, struct ieee80211_mgmt *mgmt, unsigned int len, const struct ieee80211_meshconf_ie *mesh_cfg, struct ieee80211_rx_status *rx_status); /* should be called with beacon_data under RCU read lock */ void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon); /* add other framework functions here */ }; struct mesh_csa_settings { struct rcu_head rcu_head; struct cfg80211_csa_settings settings; }; /** * struct mesh_table - mesh hash table * * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. * @gates_lock: protects updates to known_gates * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr * @walk_head: linked list containing all mesh_path objects * @walk_lock: lock protecting walk_head * @entries: number of entries in the table */ struct mesh_table { struct hlist_head known_gates; spinlock_t gates_lock; struct rhashtable rhead; struct hlist_head walk_head; spinlock_t walk_lock; atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ }; /** * struct mesh_tx_cache - mesh fast xmit header cache * * @rht: hash table containing struct ieee80211_mesh_fast_tx, using skb DA as key * @walk_head: linked list containing all ieee80211_mesh_fast_tx objects * @walk_lock: lock protecting walk_head and rht */ struct mesh_tx_cache { struct rhashtable rht; struct hlist_head walk_head; spinlock_t walk_lock; }; struct ieee80211_if_mesh { struct timer_list housekeeping_timer; struct timer_list mesh_path_timer; struct timer_list mesh_path_root_timer; unsigned long wrkq_flags; unsigned long mbss_changed[64 / BITS_PER_LONG]; bool userspace_handles_dfs; u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN]; size_t mesh_id_len; /* Active Path Selection Protocol Identifier */ u8 mesh_pp_id; /* Active Path Selection Metric Identifier */ u8 mesh_pm_id; /* Congestion Control Mode Identifier */ u8 mesh_cc_id; /* Synchronization Protocol Identifier */ u8 mesh_sp_id; /* Authentication Protocol Identifier */ u8 mesh_auth_id; /* Local mesh Sequence Number */ u32 sn; /* Last used PREQ ID */ u32 preq_id; atomic_t mpaths; /* Timestamp of last SN update */ unsigned long last_sn_update; /* Time when it's ok to send next PERR */ unsigned long next_perr; /* Timestamp of last PREQ sent */ unsigned long last_preq; struct mesh_rmc *rmc; spinlock_t mesh_preq_queue_lock; struct mesh_preq_queue preq_queue; int preq_queue_len; struct mesh_stats mshstats; struct mesh_config mshcfg; atomic_t estab_plinks; atomic_t mesh_seqnum; bool accepting_plinks; int num_gates; struct beacon_data __rcu *beacon; const u8 *ie; u8 ie_len; enum { IEEE80211_MESH_SEC_NONE = 0x0, IEEE80211_MESH_SEC_AUTHED = 0x1, IEEE80211_MESH_SEC_SECURED = 0x2, } security; bool user_mpm; /* Extensible Synchronization Framework */ const struct ieee80211_mesh_sync_ops *sync_ops; s64 sync_offset_clockdrift_max; spinlock_t sync_offset_lock; /* mesh power save */ enum nl80211_mesh_power_mode nonpeer_pm; int ps_peers_light_sleep; int ps_peers_deep_sleep; struct ps_data ps; /* Channel Switching Support */ struct mesh_csa_settings __rcu *csa; enum { IEEE80211_MESH_CSA_ROLE_NONE, IEEE80211_MESH_CSA_ROLE_INIT, IEEE80211_MESH_CSA_ROLE_REPEATER, } csa_role; u8 chsw_ttl; u16 pre_value; /* offset from skb->data while building IE */ int meshconf_offset; struct mesh_table mesh_paths; struct mesh_table mpp_paths; /* Store paths for MPP&MAP */ int mesh_paths_generation; int mpp_paths_generation; struct mesh_tx_cache tx_cache; }; #ifdef CONFIG_MAC80211_MESH #define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ do { (msh)->mshstats.name++; } while (0) #else #define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ do { } while (0) #endif /** * enum ieee80211_sub_if_data_flags - virtual interface flags * * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between * associated stations and deliver multicast frames both * back to wireless media and to the local net stack. * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume. * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver * @IEEE80211_SDATA_DISCONNECT_HW_RESTART: Disconnect after hardware restart * recovery */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), IEEE80211_SDATA_IN_DRIVER = BIT(5), IEEE80211_SDATA_DISCONNECT_HW_RESTART = BIT(6), }; /** * enum ieee80211_sdata_state_bits - virtual interface state bits * @SDATA_STATE_RUNNING: virtual interface is up & running; this * mirrors netif_running() but is separate for interface type * change handling while the interface is up * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel * mode, so queues are stopped * @SDATA_STATE_OFFCHANNEL_BEACON_STOPPED: Beaconing was stopped due * to offchannel, reset when offchannel returns */ enum ieee80211_sdata_state_bits { SDATA_STATE_RUNNING, SDATA_STATE_OFFCHANNEL, SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, }; /** * enum ieee80211_chanctx_mode - channel context configuration mode * * @IEEE80211_CHANCTX_SHARED: channel context may be used by * multiple interfaces * @IEEE80211_CHANCTX_EXCLUSIVE: channel context can be used * only by a single interface. This can be used for example for * non-fixed channel IBSS. */ enum ieee80211_chanctx_mode { IEEE80211_CHANCTX_SHARED, IEEE80211_CHANCTX_EXCLUSIVE }; /** * enum ieee80211_chanctx_replace_state - channel context replacement state * * This is used for channel context in-place reservations that require channel * context switch/swap. * * @IEEE80211_CHANCTX_REPLACE_NONE: no replacement is taking place * @IEEE80211_CHANCTX_WILL_BE_REPLACED: this channel context will be replaced * by a (not yet registered) channel context pointed by %replace_ctx. * @IEEE80211_CHANCTX_REPLACES_OTHER: this (not yet registered) channel context * replaces an existing channel context pointed to by %replace_ctx. */ enum ieee80211_chanctx_replace_state { IEEE80211_CHANCTX_REPLACE_NONE, IEEE80211_CHANCTX_WILL_BE_REPLACED, IEEE80211_CHANCTX_REPLACES_OTHER, }; struct ieee80211_chanctx { struct list_head list; struct rcu_head rcu_head; enum ieee80211_chanctx_replace_state replace_state; struct ieee80211_chanctx *replace_ctx; enum ieee80211_chanctx_mode mode; bool driver_present; /* temporary data for search algorithm etc. */ struct ieee80211_chan_req req; bool radar_detected; /* MUST be last - ends in a flexible-array member. */ struct ieee80211_chanctx_conf conf; }; struct mac80211_qos_map { struct cfg80211_qos_map qos_map; struct rcu_head rcu_head; }; enum txq_info_flags { IEEE80211_TXQ_STOP, IEEE80211_TXQ_AMPDU, IEEE80211_TXQ_NO_AMSDU, IEEE80211_TXQ_DIRTY, }; /** * struct txq_info - per tid queue * * @tin: contains packets split into multiple flows * @def_cvars: codel vars for the @tin's default_flow * @cstats: code statistics for this queue * @frags: used to keep fragments created after dequeue * @schedule_order: used with ieee80211_local->active_txqs * @schedule_round: counter to prevent infinite loops on TXQ scheduling * @flags: TXQ flags from &enum txq_info_flags * @txq: the driver visible part */ struct txq_info { struct fq_tin tin; struct codel_vars def_cvars; struct codel_stats cstats; u16 schedule_round; struct list_head schedule_order; struct sk_buff_head frags; unsigned long flags; /* keep last! */ struct ieee80211_txq txq; }; struct ieee80211_if_mntr { u32 flags; u8 mu_follow_addr[ETH_ALEN] __aligned(2); struct list_head list; }; /** * struct ieee80211_if_nan - NAN state * * @conf: current NAN configuration * @started: true iff NAN is started * @func_lock: lock for @func_inst_ids * @function_inst_ids: a bitmap of available instance_id's */ struct ieee80211_if_nan { struct cfg80211_nan_conf conf; bool started; /* protects function_inst_ids */ spinlock_t func_lock; struct idr function_inst_ids; }; struct ieee80211_link_data_managed { u8 bssid[ETH_ALEN] __aligned(2); u8 dtim_period; enum ieee80211_smps_mode req_smps, /* requested smps mode */ driver_smps_mode; /* smps mode request */ struct ieee80211_conn_settings conn; s16 p2p_noa_index; bool tdls_chan_switch_prohibited; bool have_beacon; bool tracking_signal_avg; bool disable_wmm_tracking; bool operating_11g_mode; struct { struct wiphy_hrtimer_work switch_work; struct cfg80211_chan_def ap_chandef; struct ieee80211_parsed_tpe tpe; ktime_t time; bool waiting_bcn; bool ignored_same_chan; bool blocked_tx; } csa; struct wiphy_work request_smps_work; /* used to reconfigure hardware SM PS */ struct wiphy_work recalc_smps; bool beacon_crc_valid; u32 beacon_crc; struct ewma_beacon_signal ave_beacon_signal; int last_ave_beacon_signal; /* * Number of Beacon frames used in ave_beacon_signal. This can be used * to avoid generating less reliable cqm events that would be based * only on couple of received frames. */ unsigned int count_beacon_signal; /* Number of times beacon loss was invoked. */ unsigned int beacon_loss_count; /* * Last Beacon frame signal strength average (ave_beacon_signal / 16) * that triggered a cqm event. 0 indicates that no event has been * generated for the current association. */ int last_cqm_event_signal; int wmm_last_param_set; int mu_edca_last_param_set; }; struct ieee80211_link_data_ap { struct beacon_data __rcu *beacon; struct probe_resp __rcu *probe_resp; struct fils_discovery_data __rcu *fils_discovery; struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp; struct s1g_short_beacon_data __rcu *s1g_short_beacon; /* to be used after channel switch. */ struct cfg80211_beacon_data *next_beacon; }; struct ieee80211_link_data { struct ieee80211_sub_if_data *sdata; unsigned int link_id; /* multicast keys only */ struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS]; struct ieee80211_key __rcu *default_multicast_key; struct ieee80211_key __rcu *default_mgmt_key; struct ieee80211_key __rcu *default_beacon_key; bool operating_11g_mode; struct { struct wiphy_work finalize_work; struct ieee80211_chan_req chanreq; } csa; struct wiphy_work color_change_finalize_work; struct wiphy_delayed_work color_collision_detect_work; u64 color_bitmap; /* context reservation -- protected with wiphy mutex */ struct ieee80211_chanctx *reserved_chanctx; struct ieee80211_chan_req reserved; bool reserved_radar_required; bool reserved_ready; u8 needed_rx_chains; enum ieee80211_smps_mode smps_mode; int user_power_level; /* in dBm */ int ap_power_level; /* in dBm */ bool radar_required; struct wiphy_delayed_work dfs_cac_timer_work; union { struct ieee80211_link_data_managed mgd; struct ieee80211_link_data_ap ap; } u; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct ieee80211_bss_conf *conf; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif }; struct ieee80211_sub_if_data { struct list_head list; struct wireless_dev wdev; /* keys */ struct list_head key_list; /* count for keys needing tailroom space allocation */ int crypto_tx_tailroom_needed_cnt; int crypto_tx_tailroom_pending_dec; struct wiphy_delayed_work dec_tailroom_needed_wk; struct net_device *dev; struct ieee80211_local *local; unsigned int flags; unsigned long state; char name[IFNAMSIZ]; struct ieee80211_fragment_cache frags; /* TID bitmap for NoAck policy */ u16 noack_map; /* bit field of ACM bits (BIT(802.1D tag)) */ u8 wmm_acm; struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS]; struct ieee80211_key __rcu *default_unicast_key; u16 sequence_number; u16 mld_mcast_seq; __be16 control_port_protocol; bool control_port_no_encrypt; bool control_port_no_preauth; bool control_port_over_nl80211; atomic_t num_tx_queued; struct mac80211_qos_map __rcu *qos_map; struct wiphy_work work; struct sk_buff_head skb_queue; struct sk_buff_head status_queue; /* * AP this belongs to: self in AP mode and * corresponding AP in VLAN mode, NULL for * all others (might be needed later in IBSS) */ struct ieee80211_if_ap *bss; /* bitmap of allowed (non-MCS) rate indexes for rate control */ u32 rc_rateidx_mask[NUM_NL80211_BANDS]; bool rc_has_mcs_mask[NUM_NL80211_BANDS]; u8 rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN]; bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; /* Beacon frame (non-MCS) rate (as a bitmap) */ u32 beacon_rateidx_mask[NUM_NL80211_BANDS]; bool beacon_rate_set; union { struct ieee80211_if_ap ap; struct ieee80211_if_vlan vlan; struct ieee80211_if_managed mgd; struct ieee80211_if_ibss ibss; struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; struct ieee80211_if_mntr mntr; struct ieee80211_if_nan nan; } u; struct ieee80211_link_data deflink; struct ieee80211_link_data __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; /* for ieee80211_set_active_links_async() */ struct wiphy_work activate_links_work; u16 desired_active_links; u16 restart_active_links; #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *subdir_stations; struct dentry *default_unicast_key; struct dentry *default_multicast_key; struct dentry *default_mgmt_key; struct dentry *default_beacon_key; } debugfs; #endif u32 tx_handlers_drop; /* must be last, dynamically sized area in this! */ struct ieee80211_vif vif; }; static inline struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) { return container_of(p, struct ieee80211_sub_if_data, vif); } #define sdata_dereference(p, sdata) \ wiphy_dereference(sdata->local->hw.wiphy, p) #define for_each_sdata_link(_local, _link) \ /* outer loop just to define the variables ... */ \ for (struct ieee80211_sub_if_data *___sdata = NULL; \ !___sdata; \ ___sdata = (void *)~0 /* always stop */) \ for (int ___link_id = ARRAY_SIZE(___sdata->link); \ ___link_id; ___link_id = 0 /* always stop */) \ list_for_each_entry(___sdata, &(_local)->interfaces, list) \ if (___link_id == ARRAY_SIZE(___sdata->link) && \ ieee80211_sdata_running(___sdata)) \ for (___link_id = 0; \ ___link_id < ARRAY_SIZE(___sdata->link); \ ___link_id++) \ if ((_link = wiphy_dereference((_local)->hw.wiphy, \ ___sdata->link[___link_id]))) /* * for_each_sdata_link_rcu() must be used under RCU read lock. */ #define for_each_sdata_link_rcu(_local, _link) \ /* outer loop just to define the variables ... */ \ for (struct ieee80211_sub_if_data *___sdata = NULL; \ !___sdata; \ ___sdata = (void *)~0 /* always stop */) \ for (int ___link_id = ARRAY_SIZE(___sdata->link); \ ___link_id; ___link_id = 0 /* always stop */) \ list_for_each_entry(___sdata, &(_local)->interfaces, list) \ if (___link_id == ARRAY_SIZE(___sdata->link) && \ ieee80211_sdata_running(___sdata)) \ for (___link_id = 0; \ ___link_id < ARRAY_SIZE((___sdata)->link); \ ___link_id++) \ if ((_link = rcu_dereference((___sdata)->link[___link_id]))) #define for_each_link_data(sdata, __link) \ /* outer loop just to define the variable ... */ \ for (struct ieee80211_sub_if_data *__sdata = (sdata); __sdata; \ __sdata = NULL /* always stop */) \ for (int __link_id = 0; \ __link_id < ARRAY_SIZE((__sdata)->link); __link_id++) \ if ((!(__sdata)->vif.valid_links || \ (__sdata)->vif.valid_links & BIT(__link_id)) && \ ((__link) = sdata_dereference((__sdata)->link[__link_id], \ (__sdata)))) /* * for_each_link_data_rcu should be used under RCU read lock. */ #define for_each_link_data_rcu(sdata, __link) \ /* outer loop just to define the variable ... */ \ for (struct ieee80211_sub_if_data *__sdata = (sdata); __sdata; \ __sdata = NULL /* always stop */) \ for (int __link_id = 0; \ __link_id < ARRAY_SIZE((__sdata)->link); __link_id++) \ if ((!(__sdata)->vif.valid_links || \ (__sdata)->vif.valid_links & BIT(__link_id)) && \ ((__link) = rcu_dereference((__sdata)->link[__link_id]))) \ static inline int ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems, struct cfg80211_rnr_elems *rnr_elems, u8 i) { int len = 0; if (!elems || !elems->cnt || i > elems->cnt) return 0; if (i < elems->cnt) { len = elems->elem[i].len; if (rnr_elems) { len += rnr_elems->elem[i].len; for (i = elems->cnt; i < rnr_elems->cnt; i++) len += rnr_elems->elem[i].len; } return len; } /* i == elems->cnt, calculate total length of all MBSSID elements */ for (i = 0; i < elems->cnt; i++) len += elems->elem[i].len; if (rnr_elems) { for (i = 0; i < rnr_elems->cnt; i++) len += rnr_elems->elem[i].len; } return len; } enum { IEEE80211_RX_MSG = 1, IEEE80211_TX_STATUS_MSG = 2, }; enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_DRIVER, IEEE80211_QUEUE_STOP_REASON_PS, IEEE80211_QUEUE_STOP_REASON_CSA, IEEE80211_QUEUE_STOP_REASON_AGGREGATION, IEEE80211_QUEUE_STOP_REASON_SUSPEND, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, IEEE80211_QUEUE_STOP_REASON_FLUSH, IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID, IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE, IEEE80211_QUEUE_STOP_REASONS, }; #ifdef CONFIG_MAC80211_LEDS struct tpt_led_trigger { char name[32]; const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; struct timer_list timer; struct ieee80211_local *local; unsigned long prev_traffic; unsigned long tx_bytes, rx_bytes; unsigned int active, want; bool running; }; #endif /** * enum mac80211_scan_flags - currently active scan mode * * @SCAN_SW_SCANNING: We're currently in the process of scanning but may as * well be on the operating channel * @SCAN_HW_SCANNING: The hardware is scanning for us, we have no way to * determine if we are on the operating channel or not * @SCAN_ONCHANNEL_SCANNING: Do a software scan on only the current operating * channel. This should not interrupt normal traffic. * @SCAN_COMPLETED: Set for our scan work function when the driver reported * that the scan completed. * @SCAN_ABORTED: Set for our scan work function when the driver reported * a scan complete for an aborted scan. * @SCAN_HW_CANCELLED: Set for our scan work function when the scan is being * cancelled. * @SCAN_BEACON_WAIT: Set whenever we're passive scanning because of radar/no-IR * and could send a probe request after receiving a beacon. * @SCAN_BEACON_DONE: Beacon received, we can now send a probe request */ enum mac80211_scan_flags { SCAN_SW_SCANNING, SCAN_HW_SCANNING, SCAN_ONCHANNEL_SCANNING, SCAN_COMPLETED, SCAN_ABORTED, SCAN_HW_CANCELLED, SCAN_BEACON_WAIT, SCAN_BEACON_DONE, }; /** * enum mac80211_scan_state - scan state machine states * * @SCAN_DECISION: Main entry point to the scan state machine, this state * determines if we should keep on scanning or switch back to the * operating channel * @SCAN_SET_CHANNEL: Set the next channel to be scanned * @SCAN_SEND_PROBE: Send probe requests and wait for probe responses * @SCAN_SUSPEND: Suspend the scan and go back to operating channel to * send out data * @SCAN_RESUME: Resume the scan and scan the next channel * @SCAN_ABORT: Abort the scan and go back to operating channel */ enum mac80211_scan_state { SCAN_DECISION, SCAN_SET_CHANNEL, SCAN_SEND_PROBE, SCAN_SUSPEND, SCAN_RESUME, SCAN_ABORT, }; DECLARE_STATIC_KEY_FALSE(aql_disable); struct ieee80211_local { /* embed the driver visible part. * don't cast (use the static inlines below), but we keep * it first anyway so they become a no-op */ struct ieee80211_hw hw; struct fq fq; struct codel_vars *cvars; struct codel_params cparams; /* protects active_txqs and txqi->schedule_order */ spinlock_t active_txq_lock[IEEE80211_NUM_ACS]; struct list_head active_txqs[IEEE80211_NUM_ACS]; u16 schedule_round[IEEE80211_NUM_ACS]; /* serializes ieee80211_handle_wake_tx_queue */ spinlock_t handle_wake_tx_queue_lock; u16 airtime_flags; u32 aql_txq_limit_low[IEEE80211_NUM_ACS]; u32 aql_txq_limit_high[IEEE80211_NUM_ACS]; u32 aql_threshold; atomic_t aql_total_pending_airtime; atomic_t aql_ac_pending_airtime[IEEE80211_NUM_ACS]; const struct ieee80211_ops *ops; /* * private workqueue to mac80211. mac80211 makes this accessible * via ieee80211_queue_work() */ struct workqueue_struct *workqueue; unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; int q_stop_reasons[IEEE80211_MAX_QUEUES][IEEE80211_QUEUE_STOP_REASONS]; /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ spinlock_t queue_stop_reason_lock; int open_count; int monitors, virt_monitors, tx_mntrs; /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; bool probe_req_reg; bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ struct cfg80211_chan_def dflt_chandef; bool emulate_chanctx; /* protects the aggregated multicast list and filter calls */ spinlock_t filter_lock; /* used for uploading changed mc list */ struct wiphy_work reconfig_filter; /* aggregated multicast list */ struct netdev_hw_addr_list mc_list; bool tim_in_locked_section; /* see ieee80211_beacon_get() */ /* * suspended is true if we finished all the suspend _and_ we have * not yet come up from resume. This is to be used by mac80211 * to ensure driver sanity during suspend and mac80211's own * sanity. It can eventually be used for WoW as well. */ bool suspended; /* suspending is true during the whole suspend process */ bool suspending; /* * Resuming is true while suspended, but when we're reprogramming the * hardware -- at that time it's allowed to use ieee80211_queue_work() * again even though some other parts of the stack are still suspended * and we still drop received frames to avoid waking the stack. */ bool resuming; /* * quiescing is true during the suspend process _only_ to * ease timer cancelling etc. */ bool quiescing; /* device is started */ bool started; /* device is during a HW reconfig */ bool in_reconfig; /* reconfiguration failed ... suppress some warnings etc. */ bool reconfig_failure; /* wowlan is enabled -- don't reconfig on resume */ bool wowlan; struct wiphy_work radar_detected_work; /* number of RX chains the hardware has */ u8 rx_chains; /* bitmap of which sbands were copied */ u8 sband_allocated; int tx_headroom; /* required headroom for hardware/radiotap */ /* Tasklet and skb queue to process calls from IRQ mode. All frames * added to skb_queue will be processed, but frames in * skb_queue_unreliable may be dropped if the total length of these * queues increases over the limit. */ #define IEEE80211_IRQSAFE_QUEUE_LIMIT 128 struct tasklet_struct tasklet; struct sk_buff_head skb_queue; struct sk_buff_head skb_queue_unreliable; spinlock_t rx_path_lock; /* Station data */ /* * The list, hash table and counter are protected * by the wiphy mutex, reads are done with RCU. */ spinlock_t tim_lock; unsigned long num_sta; struct list_head sta_list; struct rhltable sta_hash; struct rhltable link_sta_hash; struct timer_list sta_cleanup; int sta_generation; struct sk_buff_head pending[IEEE80211_MAX_QUEUES]; struct tasklet_struct tx_pending_tasklet; struct tasklet_struct wake_txqs_tasklet; atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; /* number of interfaces with allmulti RX */ atomic_t iff_allmultis; struct rate_control_ref *rate_ctrl; struct arc4_ctx wep_tx_ctx; struct arc4_ctx wep_rx_ctx; u32 wep_iv; /* see iface.c */ struct list_head interfaces; struct list_head mon_list; /* only that are IFF_UP */ struct mutex iflist_mtx; /* Scanning and BSS list */ unsigned long scanning; struct cfg80211_ssid scan_ssid; struct cfg80211_scan_request *int_scan_req; struct cfg80211_scan_request __rcu *scan_req; struct ieee80211_scan_request *hw_scan_req; struct cfg80211_chan_def scan_chandef; enum nl80211_band hw_scan_band; int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; struct cfg80211_scan_info scan_info; struct wiphy_work sched_scan_stopped_work; struct ieee80211_sub_if_data __rcu *sched_scan_sdata; struct cfg80211_sched_scan_request __rcu *sched_scan_req; u8 scan_addr[ETH_ALEN]; unsigned long leave_oper_channel_time; enum mac80211_scan_state next_scan_state; struct wiphy_delayed_work scan_work; struct ieee80211_sub_if_data __rcu *scan_sdata; /* Temporary remain-on-channel for off-channel operations */ struct ieee80211_channel *tmp_channel; /* channel contexts */ struct list_head chanctx_list; #ifdef CONFIG_MAC80211_LEDS struct led_trigger tx_led, rx_led, assoc_led, radio_led; struct led_trigger tpt_led; atomic_t tx_led_active, rx_led_active, assoc_led_active; atomic_t radio_led_active, tpt_led_active; struct tpt_led_trigger *tpt_led_trigger; #endif #ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* SNMP counters */ /* dot11CountersTable */ u32 dot11TransmittedFragmentCount; u32 dot11MulticastTransmittedFrameCount; u32 dot11FailedCount; u32 dot11RetryCount; u32 dot11MultipleRetryCount; u32 dot11FrameDuplicateCount; u32 dot11ReceivedFragmentCount; u32 dot11MulticastReceivedFrameCount; u32 dot11TransmittedFrameCount; /* TX/RX handler statistics */ unsigned int tx_handlers_queued; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; unsigned int tx_handlers_drop_unauth_port; unsigned int rx_handlers_drop; unsigned int rx_handlers_queued; unsigned int rx_handlers_drop_nullfunc; unsigned int rx_handlers_drop_defrag; unsigned int tx_expand_skb_head; unsigned int tx_expand_skb_head_cloned; unsigned int rx_expand_skb_head_defrag; unsigned int rx_handlers_fragments; unsigned int tx_status_drop; #define I802_DEBUG_INC(c) (c)++ #else /* CONFIG_MAC80211_DEBUG_COUNTERS */ #define I802_DEBUG_INC(c) do { } while (0) #endif /* CONFIG_MAC80211_DEBUG_COUNTERS */ int total_ps_buffered; /* total number of all buffered unicast and * multicast packets for power saving stations */ bool pspolling; /* * PS can only be enabled when we have exactly one managed * interface (and monitors) in PS, this then points there. */ struct ieee80211_sub_if_data *ps_sdata; struct wiphy_work dynamic_ps_enable_work; struct wiphy_work dynamic_ps_disable_work; struct timer_list dynamic_ps_timer; struct notifier_block ifa_notifier; struct notifier_block ifa6_notifier; /* * The dynamic ps timeout configured from user space via WEXT - * this will override whatever chosen by mac80211 internally. */ int dynamic_ps_forced_timeout; int user_power_level; /* in dBm, for all interfaces */ struct work_struct restart_work; #ifdef CONFIG_MAC80211_DEBUGFS struct local_debugfsdentries { struct dentry *rcdir; struct dentry *keys; } debugfs; bool force_tx_status; #endif /* * Remain-on-channel support */ struct wiphy_delayed_work roc_work; struct list_head roc_list; struct wiphy_work hw_roc_start, hw_roc_done; unsigned long hw_roc_start_time; u64 roc_cookie_counter; struct idr ack_status_frames; spinlock_t ack_status_lock; /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct ieee80211_chan_req monitor_chanreq; /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; bool wbrf_supported; }; static inline struct ieee80211_sub_if_data * IEEE80211_DEV_TO_SUB_IF(const struct net_device *dev) { return netdev_priv(dev); } static inline struct ieee80211_sub_if_data * IEEE80211_WDEV_TO_SUB_IF(struct wireless_dev *wdev) { return container_of(wdev, struct ieee80211_sub_if_data, wdev); } static inline struct ieee80211_supported_band * ieee80211_get_sband(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; WARN_ON(ieee80211_vif_is_mld(&sdata->vif)); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return NULL; } band = chanctx_conf->def.chan->band; rcu_read_unlock(); return local->hw.wiphy->bands[band]; } static inline struct ieee80211_supported_band * ieee80211_get_link_sband(struct ieee80211_link_data *link) { struct ieee80211_local *local = link->sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return NULL; } band = chanctx_conf->def.chan->band; rcu_read_unlock(); return local->hw.wiphy->bands[band]; } /* this struct holds the value parsing from channel switch IE */ struct ieee80211_csa_ie { struct ieee80211_chan_req chanreq; u8 mode; u8 count; u8 ttl; u16 pre_value; u16 reason_code; u32 max_switch_time; }; enum ieee80211_elems_parse_error { IEEE80211_PARSE_ERR_INVALID_END = BIT(0), IEEE80211_PARSE_ERR_DUP_ELEM = BIT(1), IEEE80211_PARSE_ERR_BAD_ELEM_SIZE = BIT(2), IEEE80211_PARSE_ERR_UNEXPECTED_ELEM = BIT(3), IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC = BIT(4), }; /* Parsed Information Elements */ struct ieee802_11_elems { const u8 *ie_start; size_t total_len; u32 crc; /* pointers to IEs */ const struct ieee80211_tdls_lnkie *lnk_id; const struct ieee80211_ch_switch_timing *ch_sw_timing; const u8 *ext_capab; const u8 *ssid; const u8 *supp_rates; const u8 *ds_params; const struct ieee80211_tim_ie *tim; const u8 *rsn; const u8 *rsnx; const u8 *erp_info; const u8 *ext_supp_rates; const u8 *wmm_info; const u8 *wmm_param; const struct ieee80211_ht_cap *ht_cap_elem; const struct ieee80211_ht_operation *ht_operation; const struct ieee80211_vht_cap *vht_cap_elem; const struct ieee80211_vht_operation *vht_operation; const struct ieee80211_meshconf_ie *mesh_config; const u8 *he_cap; const struct ieee80211_he_operation *he_operation; const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; const __le16 *awake_window; const u8 *preq; const u8 *prep; const u8 *perr; const struct ieee80211_rann_ie *rann; const struct ieee80211_channel_sw_ie *ch_switch_ie; const struct ieee80211_ext_chansw_ie *ext_chansw_ie; const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const u8 *max_channel_switch_time; const u8 *country_elem; const u8 *pwr_constr_elem; const u8 *cisco_dtpc_elem; const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie; const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie; const struct ieee80211_multiple_bssid_configuration *mbssid_config_ie; const struct ieee80211_bssid_index *bssid_index; u8 max_bssid_indicator; u8 dtim_count; u8 dtim_period; const struct ieee80211_addba_ext_ie *addba_ext_ie; const struct ieee80211_s1g_cap *s1g_capab; const struct ieee80211_s1g_oper_ie *s1g_oper; const struct ieee80211_s1g_bcn_compat_ie *s1g_bcn_compat; const struct ieee80211_aid_response_ie *aid_resp; const struct ieee80211_eht_cap_elem *eht_cap; const struct ieee80211_eht_operation *eht_operation; const struct ieee80211_multi_link_elem *ml_basic; const struct ieee80211_multi_link_elem *ml_reconf; const struct ieee80211_multi_link_elem *ml_epcs; const struct ieee80211_bandwidth_indication *bandwidth_indication; const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT]; /* not the order in the psd values is per element, not per chandef */ struct ieee80211_parsed_tpe tpe; struct ieee80211_parsed_tpe csa_tpe; /* length of them, respectively */ u8 ext_capab_len; u8 ssid_len; u8 supp_rates_len; u8 tim_len; u8 rsn_len; u8 rsnx_len; u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; u8 he_cap_len; u8 mesh_id_len; u8 peering_len; u8 preq_len; u8 prep_len; u8 perr_len; u8 country_elem_len; u8 bssid_index_len; u8 eht_cap_len; /* mult-link element can be de-fragmented and thus u8 is not sufficient */ size_t ml_basic_len; size_t ml_reconf_len; size_t ml_epcs_len; u8 ttlm_num; /* * store the per station profile pointer and length in case that the * parsing also handled Multi-Link element parsing for a specific link * ID. */ struct ieee80211_mle_per_sta_profile *prof; size_t sta_prof_len; /* whether/which parse error occurred while retrieving these elements */ u8 parse_error; }; static inline struct ieee80211_local *hw_to_local( struct ieee80211_hw *hw) { return container_of(hw, struct ieee80211_local, hw); } static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq) { return container_of(txq, struct txq_info, txq); } static inline bool txq_has_queue(struct ieee80211_txq *txq) { struct txq_info *txqi = to_txq_info(txq); return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets); } static inline bool ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) { return status->flag & RX_FLAG_MACTIME; } void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_block_queues_csa(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_unblock_queues_csa(struct ieee80211_sub_if_data *sdata); /* This function returns the number of multicast stations connected to this * interface. It returns -1 if that number is not tracked, that is for netdevs * not in AP or AP_VLAN mode or when using 4addr. */ static inline int ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_AP) return atomic_read(&sdata->u.ap.num_mcast_sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta) return atomic_read(&sdata->u.vlan.num_mcast_sta); return -1; } u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset); int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx, u32 changed); int ieee80211_hw_conf_chan(struct ieee80211_local *local); void ieee80211_hw_conf_init(struct ieee80211_local *local); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u64 changed); void ieee80211_vif_cfg_change_notify(struct ieee80211_sub_if_data *sdata, u64 changed); void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, u64 changed); void ieee80211_configure_filter(struct ieee80211_local *local); u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); void ieee80211_handle_queued_frames(struct ieee80211_local *local); u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp); void ieee80211_check_fast_rx(struct sta_info *sta); void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_rx(struct sta_info *sta); bool ieee80211_is_our_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr, int *out_link_id); /* STA code */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct cfg80211_auth_request *req); int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req); int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct cfg80211_deauth_request *req); int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_disassoc_request *req); void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_ps(struct ieee80211_local *local); void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, __le16 fc, bool acked); void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, u8 reason, bool tx); void ieee80211_mgd_setup_link(struct ieee80211_link_data *link); void ieee80211_mgd_stop_link(struct ieee80211_link_data *link); void ieee80211_mgd_set_link_qos_params(struct ieee80211_link_data *link); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates); int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params); int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings, u64 *changed); int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed); void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata); /* OCB code */ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates); void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, struct ocb_setup *setup); int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata); /* mesh code */ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings, u64 *changed); int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed); /* scan/BSS handling */ void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req); void ieee80211_scan_cancel(struct ieee80211_local *local); void ieee80211_run_deferred_scan(struct ieee80211_local *local); void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *bss, const struct cfg80211_bss_ies *ies, void *data); void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel); void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss); /* scheduled scan handling */ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_stop(struct ieee80211_local *local); void ieee80211_sched_scan_end(struct ieee80211_local *local); void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work); /* off-channel/mgmt-tx */ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local); void ieee80211_offchannel_return(struct ieee80211_local *local); void ieee80211_roc_setup(struct ieee80211_local *local); void ieee80211_start_next_roc(struct ieee80211_local *local); void ieee80211_reconfig_roc(struct ieee80211_local *local); void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie); int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie); int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie); int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie); /* channel switch handling */ void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params); /* color change handling */ void ieee80211_color_change_finalize_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_color_collision_detection_work(struct wiphy *wiphy, struct wiphy_work *work); /* interface handling */ #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_TC) #define MAC80211_SUPPORTED_FEATURES_RX (NETIF_F_RXCSUM) #define MAC80211_SUPPORTED_FEATURES (MAC80211_SUPPORTED_FEATURES_TX | \ MAC80211_SUPPORTED_FEATURES_RX) int ieee80211_iface_init(void); void ieee80211_iface_exit(void); int ieee80211_if_add(struct ieee80211_local *local, const char *name, unsigned char name_assign_type, struct wireless_dev **new_wdev, enum nl80211_iftype type, struct vif_params *params); int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type); void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata); void ieee80211_remove_interfaces(struct ieee80211_local *local); u32 ieee80211_idle_off(struct ieee80211_local *local); void ieee80211_recalc_idle(struct ieee80211_local *local); void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset); int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up); void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata); int ieee80211_add_virtual_monitor(struct ieee80211_local *local, struct ieee80211_sub_if_data *creator_sdata); void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link); void ieee80211_recalc_txpower(struct ieee80211_link_data *link, bool update_bss); void ieee80211_recalc_offload(struct ieee80211_local *local); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { return test_bit(SDATA_STATE_RUNNING, &sdata->state); } /* link handling */ void ieee80211_link_setup(struct ieee80211_link_data *link); void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, int link_id, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf); void ieee80211_link_stop(struct ieee80211_link_data *link); int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata, u16 new_links, u16 dormant_links); static inline void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata) { ieee80211_vif_set_links(sdata, 0, 0); } void ieee80211_apvlan_link_setup(struct ieee80211_sub_if_data *sdata); void ieee80211_apvlan_link_clear(struct ieee80211_sub_if_data *sdata); /* tx handling */ void ieee80211_clear_tx_pending(struct ieee80211_local *local); void ieee80211_tx_pending(struct tasklet_struct *t); netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct net_device *dev); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, u32 ctrl_flags, u64 *cookie); struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, struct ieee80211_tx_status *status); void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, int link_id, u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb, bool ampdu, const u8 *da, const u8 *sa); void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap); bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_cap *ht_cap_ie, struct link_sta_info *link_sta); void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code); int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid, int link_id); void ieee80211_add_addbaext(struct sk_buff *skb, const u8 req_addba_ext_data, u16 buf_size); u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, const void *elem_data, ssize_t elem_len, u16 *buf_size); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, const u8 addba_ext_data); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_addba_resp(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); static inline struct ieee80211_mgmt * ieee80211_mgmt_ba(struct sk_buff *skb, const u8 *da, struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgmt *mgmt = skb_put_zero(skb, 24); ether_addr_copy(mgmt->da, da); ether_addr_copy(mgmt->sa, sdata->vif.addr); if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ether_addr_copy(mgmt->bssid, sdata->vif.addr); else if (sdata->vif.type == NL80211_IFTYPE_STATION) ether_addr_copy(mgmt->bssid, sdata->vif.cfg.ap_addr); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) ether_addr_copy(mgmt->bssid, sdata->u.ibss.bssid); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); return mgmt; } int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_agg_stop_reason reason); void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs); enum nl80211_smps_mode ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps); void ieee80211_ht_handle_chanwidth_notif(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct link_sta_info *link_sta, u8 chanwidth, enum nl80211_band band); /* VHT */ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta); enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef); static inline enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta) { return _ieee80211_sta_cap_rx_bw(link_sta, NULL); } enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef); static inline enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta) { return _ieee80211_sta_cur_vht_bw(link_sta, NULL); } void ieee80211_sta_init_nss(struct link_sta_info *link_sta); enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta); void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *sta, u8 opmode, enum nl80211_band band); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *sta, u8 opmode, enum nl80211_band band); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]); enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *sta); /* HE */ void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct link_sta_info *link_sta); void ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, const struct ieee80211_he_spr *he_spr_ie_elem); void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, const struct ieee80211_he_operation *he_op_ie_elem); /* S1G */ void ieee80211_s1g_sta_rate_init(struct sta_info *sta); bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb); void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, const struct ieee80211_s1g_cap *s1g_cap_ie, struct link_sta_info *link_sta); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); /** * ieee80211_parse_ch_switch_ie - parses channel switch IEs * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band * @vht_cap_info: VHT capabilities of the transmitter * @conn: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted * @bssid: the currently connected bssid (for reporting) * @unprot_action: whether the frame was an unprotected frame or not, * used for reporting * @csa_ie: parsed 802.11 csa elements on count, mode, chandef and mesh ttl. * All of them will be filled with if success only. * Return: 0 on success, <0 on error and >0 if there is nothing to parse. */ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, u32 vht_cap_info, struct ieee80211_conn_settings *conn, u8 *bssid, bool unprot_action, struct ieee80211_csa_ie *csa_ie); /* Suspend/resume and hw reconfiguration */ int ieee80211_reconfig(struct ieee80211_local *local); void ieee80211_stop_device(struct ieee80211_local *local, bool suspend); int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); static inline int __ieee80211_resume(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) && !test_bit(SCAN_COMPLETED, &local->scanning), "%s: resume with hardware scan still in progress\n", wiphy_name(hw->wiphy)); return ieee80211_reconfig(hw_to_local(hw)); } /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode); enum ieee80211_conn_bw_limit ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef); int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac); void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe); void ieee80211_set_wmm_default(struct ieee80211_link_data *link, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id, enum nl80211_band band); /* sta_out needs to be checked for ERR_PTR() before using */ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info **sta_out); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, enum nl80211_band band) { rcu_read_lock(); __ieee80211_tx_skb_tid_band(sdata, skb, tid, -1, band); rcu_read_unlock(); } void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id); static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */ ieee80211_tx_skb_tid(sdata, skb, 7, -1); } /** * struct ieee80211_elems_parse_params - element parsing parameters * @mode: connection mode for parsing * @start: pointer to the elements * @len: length of the elements * @type: type of the frame the elements came from * (action, probe response, beacon, etc.) * @filter: bitmap of element IDs to filter out while calculating * the element CRC * @crc: CRC starting value * @bss: the BSS to parse this as, for multi-BSSID cases this can * represent a non-transmitting BSS in which case the data * for that non-transmitting BSS is returned * @link_id: the link ID to parse elements for, if a STA profile * is present in the multi-link element, or -1 to ignore; * note that the code currently assumes parsing an association * (or re-association) response frame if this is given * @from_ap: frame is received from an AP (currently used only * for EHT capabilities parsing) */ struct ieee80211_elems_parse_params { enum ieee80211_conn_mode mode; const u8 *start; size_t len; u8 type; u64 filter; u32 crc; struct cfg80211_bss *bss; int link_id; bool from_ap; }; struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params); static inline struct ieee802_11_elems * ieee802_11_parse_elems(const u8 *start, size_t len, u8 type, struct cfg80211_bss *bss) { struct ieee80211_elems_parse_params params = { .mode = IEEE80211_CONN_MODE_HIGHEST, .start = start, .len = len, .type = type, .bss = bss, .link_id = -1, }; return ieee802_11_parse_elems_full(&params); } extern const int ieee802_1d_to_ac[8]; static inline int ieee80211_ac_from_tid(int tid) { return ieee802_1d_to_ac[tid & 7]; } void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time); unsigned int ieee80211_get_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted); void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted); void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); static inline void ieee80211_stop_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_stop_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, true); } static inline void ieee80211_wake_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_wake_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, true); } static inline void ieee80211_stop_vif_queues_norefcount(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_stop_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, false); } static inline void ieee80211_wake_vif_queues_norefcount(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_wake_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, false); } void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs); void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop); void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop); static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) { /* * It's unsafe to try to do any work during reconfigure flow. * When the flow ends the work will be requeued. */ if (local->in_reconfig) return false; /* * If quiescing is set, we are racing with __ieee80211_suspend. * __ieee80211_suspend flushes the workers after setting quiescing, * and we check quiescing / suspended before enqueuing new workers. * We should abort the worker to avoid the races below. */ if (local->quiescing) return false; /* * We might already be suspended if the following scenario occurs: * __ieee80211_suspend Control path * * if (local->quiescing) * return; * local->quiescing = true; * flush_workqueue(); * queue_work(...); * local->suspended = true; * local->quiescing = false; * worker starts running... */ if (local->suspended) return false; return true; } int ieee80211_txq_setup_flows(struct ieee80211_local *local); void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx); void ieee80211_txq_teardown_flows(struct ieee80211_local *local); void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txq, int tid); void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi); void ieee80211_purge_sta_txqs(struct sta_info *sta); void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi); void ieee80211_wake_txqs(struct tasklet_struct *t); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, const u8 *da, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags); void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); enum { IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1), IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags); struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags); u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates); int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, enum ieee80211_smps_mode smps_mode); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link); void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, int link_id); size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset); u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap); u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode); void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata); u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap); int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb); void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); /* element building in SKBs */ int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, u32 basic_rates, u32 masked_rates, u8 element_id); int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn); int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode); int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn); int ieee80211_put_reg_conn(struct sk_buff *skb, enum ieee80211_channel_flags flags); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, struct ieee80211_conn_settings *conn); static inline void ieee80211_chanreq_downgrade(struct ieee80211_chan_req *chanreq, struct ieee80211_conn_settings *conn) { ieee80211_chandef_downgrade(&chanreq->oper, conn); if (WARN_ON(!conn)) return; if (conn->mode < IEEE80211_CONN_MODE_EHT) chanreq->ap.chan = NULL; } bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a, const struct ieee80211_chan_req *b); int __must_check _ieee80211_link_use_channel(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, enum ieee80211_chanctx_mode mode, bool assign_on_failure); static inline int __must_check ieee80211_link_use_channel(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, enum ieee80211_chanctx_mode mode) { return _ieee80211_link_use_channel(link, req, mode, false); } int __must_check ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, enum ieee80211_chanctx_mode mode, bool radar_required); int __must_check ieee80211_link_use_reserved_context(struct ieee80211_link_data *link); void ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link); int __must_check ieee80211_link_change_chanreq(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, u64 *changed); void __ieee80211_link_release_channel(struct ieee80211_link_data *link, bool skip_idle_recalc); void ieee80211_link_release_channel(struct ieee80211_link_data *link); void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link); void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, bool clear); int ieee80211_chanctx_refcount(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); bool ieee80211_is_radar_required(struct ieee80211_local *local, struct cfg80211_scan_request *req); bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, struct cfg80211_scan_request *scan_req, int radio_idx); void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings); void ieee80211_recalc_sb_count(struct ieee80211_sub_if_data *sdata, u64 tsf); void ieee80211_recalc_dtim(struct ieee80211_sub_if_data *sdata, u64 tsf); int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect, int radio_idx); int ieee80211_max_num_channels(struct ieee80211_local *local, int radio_idx); u32 ieee80211_get_radio_mask(struct wiphy *wiphy, struct net_device *dev); void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); /* TDLS */ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, int link_id, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *extra_ies, size_t extra_ies_len); int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper); void ieee80211_tdls_peer_del_work(struct wiphy *wiphy, struct wiphy_work *wk); int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef); void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); void ieee80211_teardown_tdls_peers(struct ieee80211_link_data *link); void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason); void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); const char *ieee80211_get_reason_code_string(u16 reason_code); u16 ieee80211_encode_usf(int val); u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type); extern const struct ethtool_ops ieee80211_ethtool_ops; u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *pubsta, int len, bool ampdu); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else #define debug_noinline #endif void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache); void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache); u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata); void ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, const struct ieee80211_eht_cap_elem *eht_cap_ie_elem, u8 eht_cap_len, struct link_sta_info *link_sta); void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct cfg80211_ttlm_params *params); void ieee80211_process_ttlm_teardown(struct ieee80211_sub_if_data *sdata); void ieee80211_check_wbrf_support(struct ieee80211_local *local); void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable); void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, struct cfg80211_ml_reconf_req *req); void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata); #if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT ieee80211_rx_result ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx); int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, u8 n_partial_subchans); void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used); struct ieee802_11_elems * ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_bss *cbss, int link_id, struct ieee80211_chan_req *chanreq, struct cfg80211_chan_def *ap_chandef, unsigned long *userspace_selectors); #else #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT static #endif #endif /* IEEE80211_I_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2009 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> */ #ifndef _LINUX_INTEGRITY_H #define _LINUX_INTEGRITY_H #include <linux/fs.h> #include <linux/iversion.h> enum integrity_status { INTEGRITY_PASS = 0, INTEGRITY_PASS_IMMUTABLE, INTEGRITY_FAIL, INTEGRITY_FAIL_IMMUTABLE, INTEGRITY_NOLABEL, INTEGRITY_NOXATTRS, INTEGRITY_UNKNOWN, }; #ifdef CONFIG_INTEGRITY extern void __init integrity_load_keys(void); #else static inline void integrity_load_keys(void) { } #endif /* CONFIG_INTEGRITY */ /* An inode's attributes for detection of changes */ struct integrity_inode_attributes { u64 version; /* track inode changes */ unsigned long ino; dev_t dev; }; /* * On stacked filesystems the i_version alone is not enough to detect file data * or metadata change. Additional metadata is required. */ static inline void integrity_inode_attrs_store(struct integrity_inode_attributes *attrs, u64 i_version, const struct inode *inode) { attrs->version = i_version; attrs->dev = inode->i_sb->s_dev; attrs->ino = inode->i_ino; } /* * On stacked filesystems detect whether the inode or its content has changed. */ static inline bool integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs, const struct inode *inode) { return (inode->i_sb->s_dev != attrs->dev || inode->i_ino != attrs->ino || !inode_eq_iversion(inode, attrs->version)); } #endif /* _LINUX_INTEGRITY_H */
2 2 303 301 7 140 140 281 281 99 99 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; switch (ctt->src.l3num) { case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; break; } ft->l3proto = ctt->src.l3num; ft->l4proto = ctt->dst.protonum; switch (ctt->dst.protonum) { case IPPROTO_TCP: case IPPROTO_UDP: ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; break; } } struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); if (!flow) return NULL; refcount_inc(&ct->ct_general.use); flow->ct = ct; flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; } EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { if (flow_tuple->l3proto == NFPROTO_IPV6) return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); return 0; } static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct dst_entry *dst = route->tuple[dir].dst; route->tuple[dir].dst = NULL; return dst; } static int flow_offload_fill_route(struct flow_offload *flow, struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; struct dst_entry *dst = nft_route_dst_fetch(route, dir); int i, j = 0; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } flow_tuple->iifidx = route->tuple[dir].in.ifindex; for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; if (route->tuple[dir].in.ingress_vlans & BIT(i)) flow_tuple->in_vlan_ingress |= BIT(j); j++; } flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest, ETH_ALEN); memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: flow_tuple->ifidx = route->tuple[dir].out.ifindex; flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; default: WARN_ON_ONCE(1); break; } flow_tuple->xmit_type = route->tuple[dir].xmit_type; return 0; } static void nft_flow_dst_release(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) dst_release(flow->tuplehash[dir].tuple.dst_cache); } void flow_offload_route_init(struct flow_offload *flow, struct nf_flow_route *route) { flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; } EXPORT_SYMBOL_GPL(flow_offload_route_init); static inline bool nf_flow_has_expired(const struct flow_offload *flow) { return nf_flow_timeout_delta(flow->timeout) <= 0; } static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state) { struct ip_ct_tcp *tcp = &ct->proto.tcp; spin_lock_bh(&ct->lock); if (tcp->state != tcp_state) tcp->state = tcp_state; /* syn packet triggers the TCP reopen case from conntrack. */ if (tcp->state == TCP_CONNTRACK_CLOSE) ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; /* Conntrack state is outdated due to offload bypass. * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks * TCP reset validation will fail. */ tcp->seen[0].td_maxwin = 0; tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; tcp->seen[1].td_maxwin = 0; tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; spin_unlock_bh(&ct->lock); } static void flow_offload_fixup_ct(struct flow_offload *flow) { struct nf_conn *ct = flow->ct; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); bool expired, closing = false; u32 offload_timeout = 0; s32 timeout; if (l4num == IPPROTO_TCP) { const struct nf_tcp_net *tn = nf_tcp_pernet(net); u8 tcp_state; /* Enter CLOSE state if fin/rst packet has been seen, this * allows TCP reopen from conntrack. Otherwise, pick up from * the last seen TCP state. */ closing = test_bit(NF_FLOW_CLOSING, &flow->flags); if (closing) { flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE); timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]); expired = false; } else { tcp_state = READ_ONCE(ct->proto.tcp.state); flow_offload_fixup_tcp(ct, tcp_state); timeout = READ_ONCE(tn->timeouts[tcp_state]); expired = nf_flow_has_expired(flow); } offload_timeout = READ_ONCE(tn->offload_timeout); } else if (l4num == IPPROTO_UDP) { const struct nf_udp_net *tn = nf_udp_pernet(net); enum udp_conntrack state = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? UDP_CT_REPLIED : UDP_CT_UNREPLIED; timeout = READ_ONCE(tn->timeouts[state]); expired = nf_flow_has_expired(flow); offload_timeout = READ_ONCE(tn->offload_timeout); } else { return; } if (expired) timeout -= offload_timeout; if (timeout < 0) timeout = 0; if (closing || nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) nf_ct_refresh(ct, timeout); } static void flow_offload_route_release(struct flow_offload *flow) { nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY); } void flow_offload_free(struct flow_offload *flow) { switch (flow->type) { case NF_FLOW_OFFLOAD_ROUTE: flow_offload_route_release(flow); break; default: break; } nf_ct_put(flow->ct); kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple_rhash *tuplehash = data; return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct flow_offload_tuple *tuple = arg->key; const struct flow_offload_tuple_rhash *x = ptr; if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) return 1; return 0; } static const struct rhashtable_params nf_flow_offload_rhash_params = { .head_offset = offsetof(struct flow_offload_tuple_rhash, node), .hashfn = flow_offload_hash, .obj_hashfn = flow_offload_hash_obj, .obj_cmpfn = flow_offload_hash_cmp, .automatic_shrinking = true, }; unsigned long flow_offload_get_timeout(struct flow_offload *flow) { unsigned long timeout = NF_FLOW_TIMEOUT; struct net *net = nf_ct_net(flow->ct); int l4num = nf_ct_protonum(flow->ct); if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); timeout = tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); timeout = tn->offload_timeout; } return timeout; } int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); if (err < 0) return err; err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[1].node, nf_flow_offload_rhash_params); if (err < 0) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); return err; } nf_ct_refresh(flow->ct, NF_CT_DAY); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); } return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow, bool force) { u32 timeout; timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); if (force || timeout - READ_ONCE(flow->timeout) > HZ) WRITE_ONCE(flow->timeout, timeout); else return; if (likely(!nf_flowtable_hw_offload(flow_table)) || test_bit(NF_FLOW_CLOSING, &flow->flags)) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags)) flow_offload_fixup_ct(flow); } EXPORT_SYMBOL_GPL(flow_offload_teardown); struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, nf_flow_offload_rhash_params); if (!tuplehash) return NULL; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; } EXPORT_SYMBOL_GPL(flow_offload_lookup); static int nf_flow_table_iterate(struct nf_flowtable *flow_table, void (*iter)(struct nf_flowtable *flowtable, struct flow_offload *flow, void *data), void *data) { struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; int err = 0; rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { if (PTR_ERR(tuplehash) != -EAGAIN) { err = PTR_ERR(tuplehash); break; } continue; } if (tuplehash->tuple.dir) continue; flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); iter(flow_table, flow, data); } rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); return err; } static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, const struct flow_offload *flow) { return flow_table->type->gc && flow_table->type->gc(flow); } /** * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry * @ct: Flowtable offloaded tcp ct * * Return: number of seconds when ct entry should expire. */ static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct) { u8 state = READ_ONCE(ct->proto.tcp.state); switch (state) { case TCP_CONNTRACK_SYN_SENT: case TCP_CONNTRACK_SYN_RECV: return 0; case TCP_CONNTRACK_ESTABLISHED: return NF_CT_DAY; case TCP_CONNTRACK_FIN_WAIT: case TCP_CONNTRACK_CLOSE_WAIT: case TCP_CONNTRACK_LAST_ACK: case TCP_CONNTRACK_TIME_WAIT: return 5 * 60 * HZ; case TCP_CONNTRACK_CLOSE: return 0; } return 0; } /** * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry * @ct: Flowtable offloaded ct * * Datapath lookups in the conntrack table will evict nf_conn entries * if they have expired. * * Once nf_conn entries have been offloaded, nf_conntrack might not see any * packets anymore. Thus ct->timeout is no longer refreshed and ct can * be evicted. * * To avoid the need for an additional check on the offload bit for every * packet processed via nf_conntrack_in(), set an arbitrary timeout large * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT * from the packet path via nf_ct_is_expired(). */ static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct) { static const u32 min_timeout = 5 * 60 * HZ; u32 expires = nf_ct_expires(ct); /* normal case: large enough timeout, nothing to do. */ if (likely(expires >= min_timeout)) return; /* must check offload bit after this, we do not hold any locks. * flowtable and ct entries could have been removed on another CPU. */ if (!refcount_inc_not_zero(&ct->ct_general.use)) return; /* load ct->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (nf_ct_is_confirmed(ct) && test_bit(IPS_OFFLOAD_BIT, &ct->status)) { u8 l4proto = nf_ct_protonum(ct); u32 new_timeout = true; switch (l4proto) { case IPPROTO_UDP: new_timeout = NF_CT_DAY; break; case IPPROTO_TCP: new_timeout = nf_flow_table_tcp_timeout(ct); break; default: WARN_ON_ONCE(1); break; } /* Update to ct->timeout from nf_conntrack happens * without holding ct->lock. * * Use cmpxchg to ensure timeout extension doesn't * happen when we race with conntrack datapath. * * The inverse -- datapath updating ->timeout right * after this -- is fine, datapath is authoritative. */ if (new_timeout) { new_timeout += nfct_time_stamp; cmpxchg(&ct->timeout, expires, new_timeout); } } nf_ct_put(ct); } static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags); if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || nf_flow_custom_gc(flow_table, flow)) { flow_offload_teardown(flow); teardown = true; } else if (!teardown) { nf_flow_table_extend_ct_timeout(flow->ct); } if (teardown) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) flow_offload_del(flow_table, flow); } else { flow_offload_del(flow_table, flow); } } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) && test_bit(NF_FLOW_HW, &flow->flags) && !test_bit(NF_FLOW_HW_DYING, &flow->flags)) { nf_flow_offload_del(flow_table, flow); } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } } void nf_flow_table_gc_run(struct nf_flowtable *flow_table) { nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); } static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; flow_table = container_of(work, struct nf_flowtable, gc_work.work); nf_flow_table_gc_run(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); } static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct udphdr *udph; udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } } static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, u8 protocol, __be16 port, __be16 new_port) { switch (protocol) { case IPPROTO_TCP: nf_flow_nat_port_tcp(skb, thoff, port, new_port); break; case IPPROTO_UDP: nf_flow_nat_port_udp(skb, thoff, port, new_port); break; } } void nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; hdr->source = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; hdr->dest = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_snat_port); void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; hdr->dest = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; hdr->source = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); int nf_flow_table_init(struct nf_flowtable *flowtable) { int err; INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); init_rwsem(&flowtable->flow_block_lock); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); if (err < 0) return err; queue_delayed_work(system_power_efficient_wq, &flowtable->gc_work, HZ); mutex_lock(&flowtable_lock); list_add(&flowtable->list, &flowtables); mutex_unlock(&flowtable_lock); return 0; } EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { struct net_device *dev = data; if (!dev) { flow_offload_teardown(flow); return; } if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) flow_offload_teardown(flow); } void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); nf_flow_table_offload_flush(flowtable); } void nf_flow_table_cleanup(struct net_device *dev) { struct nf_flowtable *flowtable; mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { mutex_lock(&flowtable_lock); list_del(&flow_table->list); mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_offload_flush(flow_table); /* ... no more pending work after this stage ... */ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_gc_run(flow_table); nf_flow_table_offload_flush_cleanup(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); static int nf_flow_table_init_net(struct net *net) { net->ft.stat = alloc_percpu(struct nf_flow_table_stat); return net->ft.stat ? 0 : -ENOMEM; } static void nf_flow_table_fini_net(struct net *net) { free_percpu(net->ft.stat); } static int nf_flow_table_pernet_init(struct net *net) { int ret; ret = nf_flow_table_init_net(net); if (ret < 0) return ret; ret = nf_flow_table_init_proc(net); if (ret < 0) goto out_proc; return 0; out_proc: nf_flow_table_fini_net(net); return ret; } static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { nf_flow_table_fini_proc(net); nf_flow_table_fini_net(net); } } static struct pernet_operations nf_flow_table_net_ops = { .init = nf_flow_table_pernet_init, .exit_batch = nf_flow_table_pernet_exit, }; static int __init nf_flow_table_module_init(void) { int ret; ret = register_pernet_subsys(&nf_flow_table_net_ops); if (ret < 0) return ret; ret = nf_flow_table_offload_init(); if (ret) goto out_offload; ret = nf_flow_register_bpf(); if (ret) goto out_bpf; return 0; out_bpf: nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; } static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); unregister_pernet_subsys(&nf_flow_table_net_ops); } module_init(nf_flow_table_module_init); module_exit(nf_flow_table_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("Netfilter flow table module");
59 87 38 1 56 36 6 30 90 90 85 85 59 49 49 41 41 107 80 119 18 1 33 23 67 23 25 67 67 61 38 2 38 38 38 38 48 42 48 16 94 93 40 94 44 44 1 44 5 41 40 18 12 6 26 49 2 46 4 26 94 44 147 111 13 76 1 116 94 44 11 91 116 59 89 97 41 38 38 94 6 20 13 15 15 3 3 91 91 115 117 65 41 82 41 118 5 118 118 68 46 46 68 68 3 38 38 38 1 82 1 21 38 87 3 86 17 13 1 1 5 1 1 1 1 3 2 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 /* * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "addr.h" #include "group.h" #include "bcast.h" #include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" #include "name_table.h" #include "subscr.h" #define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) #define ADV_IDLE ADV_UNIT #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, MBR_PENDING, MBR_ACTIVE, MBR_RECLAIMING, MBR_REMITTED, MBR_LEAVING }; struct tipc_member { struct rb_node tree_node; struct list_head list; struct list_head small_win; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; u32 port; u32 instance; enum mbr_state state; u16 advertised; u16 window; u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; }; struct tipc_group { struct rb_root members; struct list_head small_win; struct list_head pending; struct list_head active; struct tipc_nlist dests; struct net *net; int subid; u32 type; u32 instance; u32 scope; u32 portid; u16 member_cnt; u16 active_cnt; u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; bool *open; bool loopback; bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); static void tipc_group_open(struct tipc_member *m, bool *wakeup) { *wakeup = false; if (list_empty(&m->small_win)) return; list_del_init(&m->small_win); *m->group->open = true; *wakeup = true; } static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || m->state == MBR_REMITTED) grp->active_cnt--; } static int tipc_group_rcvbuf_limit(struct tipc_group *grp) { int max_active, active_pool, idle_pool; int mcnt = grp->member_cnt + 1; /* Limit simultaneous reception from other members */ max_active = min(mcnt / 8, 64); max_active = max(max_active, 16); grp->max_active = max_active; /* Reserve blocks for active and idle members */ active_pool = max_active * ADV_ACTIVE; idle_pool = (mcnt - max_active) * ADV_IDLE; /* Scale to bytes, considering worst-case truesize/msgsize ratio */ return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; } u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) { return grp->bc_snd_nxt; } static bool tipc_group_is_receiver(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } static bool tipc_group_is_sender(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) { if (!grp->loopback) return grp->portid; return 0; } struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) { u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; grp = kzalloc(sizeof(*grp), GFP_ATOMIC); if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; grp->open = group_is_open; *grp->open = false; filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, filter, &grp->subid)) return grp; kfree(grp); return NULL; } void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); tipc_group_update_member(m, 0); } tipc_node_distr_xmit(net, &xmitq); *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); __skb_queue_purge(&m->deferredq); list_del(&m->list); kfree(m); } tipc_node_distr_xmit(net, &xmitq); tipc_nlist_purge(&grp->dests); tipc_topsrv_kern_unsubscr(net, grp->subid); kfree(grp); } static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; struct tipc_member *m; while (n) { m = container_of(n, struct tipc_member, tree_node); nkey = (u64)m->node << 32 | m->port; if (key < nkey) n = n->rb_left; else if (key > nkey) n = n->rb_right; else return m; } return NULL; } static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, u32 node, u32 port) { struct tipc_member *m; m = tipc_group_find_member(grp, node, port); if (m && tipc_group_is_receiver(m)) return m; return NULL; } static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, u32 node) { struct tipc_member *m; struct rb_node *n; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (m->node == node) return m; } return NULL; } static int tipc_group_add_to_tree(struct tipc_group *grp, struct tipc_member *m) { u64 nkey, key = (u64)m->node << 32 | m->port; struct rb_node **n, *parent = NULL; struct tipc_member *tmp; n = &grp->members.rb_node; while (*n) { tmp = container_of(*n, struct tipc_member, tree_node); parent = *n; tmp = container_of(parent, struct tipc_member, tree_node); nkey = (u64)tmp->node << 32 | tmp->port; if (key < nkey) n = &(*n)->rb_left; else if (key > nkey) n = &(*n)->rb_right; else return -EEXIST; } rb_link_node(&m->tree_node, parent, n); rb_insert_color(&m->tree_node, &grp->members); return 0; } static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, u32 instance, int state) { struct tipc_member *m; int ret; m = kzalloc(sizeof(*m), GFP_ATOMIC); if (!m) return NULL; INIT_LIST_HEAD(&m->list); INIT_LIST_HEAD(&m->small_win); __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; ret = tipc_group_add_to_tree(grp, m); if (ret < 0) { kfree(m); return NULL; } grp->member_cnt++; tipc_nlist_add(&grp->dests, m->node); m->state = state; return m; } void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance) { tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, struct tipc_member *m) { rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; /* Check if we were waiting for replicast ack from this member */ if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) grp->bc_ackers--; list_del_init(&m->list); list_del_init(&m->small_win); tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) tipc_nlist_del(&grp->dests, m->node); kfree(m); } struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) { return &grp->dests; } void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; seq->lower = grp->instance; seq->upper = grp->instance; *scope = grp->scope; } void tipc_group_update_member(struct tipc_member *m, int len) { struct tipc_group *grp = m->group; struct tipc_member *_m, *tmp; if (!tipc_group_is_receiver(m)) return; m->window -= len; if (m->window >= ADV_IDLE) return; list_del_init(&m->small_win); /* Sort member into small_window members' list */ list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) { if (_m->window > m->window) break; } list_add_tail(&m->small_win, &_m->small_win); } void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) { u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_receiver(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) grp->bc_ackers = ackers; grp->bc_snd_nxt++; } bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **mbr) { struct sk_buff_head xmitq; struct tipc_member *m; int adv, state; m = tipc_group_find_dest(grp, dnode, dport); if (!tipc_group_is_receiver(m)) { *mbr = NULL; return false; } *mbr = m; if (m->window >= len) return false; *grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) return true; if (state == MBR_PENDING && adv == ADV_IDLE) return true; __skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); return true; } bool tipc_group_bc_cong(struct tipc_group *grp, int len) { struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ if (grp->bc_ackers) { *grp->open = false; return true; } if (list_empty(&grp->small_win)) return false; m = list_first_entry(&grp->small_win, struct tipc_member, small_win); if (m->window >= len) return false; return tipc_group_cong(grp, m->node, m->port, len, &m); } /* tipc_group_sort_msg() - sort msg into queue by bcast sequence number */ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) { struct tipc_msg *_hdr, *hdr = buf_msg(skb); u16 bc_seqno = msg_grp_bc_seqno(hdr); struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) continue; __skb_queue_before(defq, _skb, skb); return; } /* Bcast was not bypassed, - add to tail */ } /* Unicasts are never bypassed, - always add to tail */ __skb_queue_tail(defq, skb); } /* tipc_group_filter_msg() - determine if we should accept arriving message */ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); bool ack, deliver, update, leave = false; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; u32 node, port; int mtyp, blks; if (!skb) return; hdr = buf_msg(skb); node = msg_orignode(hdr); port = msg_origport(hdr); if (!msg_in_group(hdr)) goto drop; m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_sender(m)) goto drop; if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) goto drop; TIPC_SKB_CB(skb)->orig_member = m->instance; defq = &m->deferredq; tipc_group_sort_msg(skb, defq); while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); blks = msg_blocks(hdr); deliver = true; ack = false; update = false; if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) break; /* Decide what to do with message */ switch (mtyp) { case TIPC_GRP_MCAST_MSG: if (msg_nameinst(hdr) != grp->instance) { update = true; deliver = false; } fallthrough; case TIPC_GRP_BCAST_MSG: m->bc_rcv_nxt++; ack = msg_grp_bc_ack_req(hdr); break; case TIPC_GRP_UCAST_MSG: break; case TIPC_GRP_MEMBER_EVT: if (m->state == MBR_LEAVING) leave = true; if (!grp->events) deliver = false; break; default: break; } /* Execute decisions */ __skb_dequeue(defq); if (deliver) __skb_queue_tail(inputq, skb); else kfree_skb(skb); if (ack) tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { __skb_queue_purge(defq); tipc_group_delete_member(grp, m); break; } if (!update) continue; tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; drop: kfree_skb(skb); } void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq) { struct list_head *active = &grp->active; int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) return; m->advertised -= blks; switch (m->state) { case MBR_JOINED: /* First, decide if member can go active */ if (active_cnt <= max_active) { m->state = MBR_ACTIVE; list_add_tail(&m->list, active); grp->active_cnt++; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } else { m->state = MBR_PENDING; list_add_tail(&m->list, &grp->pending); } if (active_cnt < reclaim_limit) break; /* Reclaim from oldest active member, if possible */ if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); break; } /* Nobody to reclaim from; - revert oldest pending to JOINED */ pm = list_first_entry(&grp->pending, struct tipc_member, list); list_del_init(&pm->list); pm->state = MBR_JOINED; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); if (m->advertised > (ADV_ACTIVE * 3 / 4)) break; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); break; case MBR_REMITTED: if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } if (list_empty(&grp->pending)) return; /* Set oldest pending member to active and advertise */ pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_JOINING: case MBR_LEAVING: default: break; } } static void tipc_group_create_event(struct tipc_group *grp, struct tipc_member *m, u32 event, u16 seqno, struct sk_buff_head *inputq) { u32 dnode = tipc_own_addr(grp->net); struct tipc_event evt; struct sk_buff *skb; struct tipc_msg *hdr; memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; evt.port.ref = m->port; evt.port.node = m->node; evt.s.seq.type = grp->type; evt.s.seq.lower = m->instance; evt.s.seq.upper = m->instance; skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, GROUP_H_SIZE, sizeof(evt), dnode, m->node, grp->portid, m->port, 0); if (!skb) return; hdr = buf_msg(skb); msg_set_nametype(hdr, grp->type); msg_set_grp_evt(hdr, event); msg_set_dest_droppable(hdr, true); msg_set_grp_bc_seqno(hdr, seqno); memcpy(msg_data(hdr), &evt, sizeof(evt)); TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); } static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { struct tipc_msg *hdr; struct sk_buff *skb; int adv = 0; skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, m->node, tipc_own_addr(grp->net), m->port, grp->portid, 0); if (!skb) return; if (m->state == MBR_ACTIVE) adv = ADV_ACTIVE - m->advertised; else if (m->state == MBR_JOINED || m->state == MBR_PENDING) adv = ADV_IDLE - m->advertised; hdr = buf_msg(skb); if (mtyp == GRP_JOIN_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_LEAVE_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_ACK_MSG) { msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; u16 remitted, in_flight; if (!grp) return; if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) return; m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received if necessary */ if (m->state != MBR_PUBLISHED) return; /* Member can be taken into service */ m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); list_del_init(&m->list); tipc_group_open(m, usr_wakeup); tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) return; m->window += msg_adv_win(hdr); tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) return; list_del_init(&m->small_win); *m->group->open = true; *usr_wakeup = true; tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) return; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; m->advertised = ADV_IDLE + in_flight; return; } /* This should never happen */ if (m->advertised < remitted) pr_warn_ratelimited("Unexpected REMIT msg\n"); /* All messages preceding the REMIT have been read */ m->state = MBR_JOINED; grp->active_cnt--; m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) return; pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } } /* tipc_group_member_evt() - receive and handle a member up/down event */ void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; int event = evt->event; struct tipc_member *m; struct net *net; u32 self; if (!grp) return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) return; m = tipc_group_find_member(grp, node, port); switch (event) { case TIPC_PUBLISHED: /* Send and wait for arrival of JOIN message if necessary */ if (!m) { m = tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); if (!m) break; tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); break; } if (m->state != MBR_JOINING) break; /* Member can be taken into service */ m->instance = instance; m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); break; case TIPC_WITHDRAWN: if (!m) break; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; list_del_init(&m->list); tipc_group_open(m, usr_wakeup); /* Only send event if no LEAVE message can be expected */ if (!tipc_node_is_up(net, node)) tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_rcv_nxt, inputq); break; default: break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) { struct nlattr *group = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_GROUP); if (!group) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, grp->type) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, grp->instance) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, grp->bc_snd_nxt)) goto group_msg_cancel; if (grp->scope == TIPC_NODE_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) goto group_msg_cancel; if (grp->scope == TIPC_CLUSTER_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) goto group_msg_cancel; if (*grp->open) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) goto group_msg_cancel; nla_nest_end(skb, group); return 0; group_msg_cancel: nla_nest_cancel(skb, group); return -1; }
35 6 1 1 22 5 12 15 30 19 20 2 2 2 14 23 4 8 13 21 17 3 1 15 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ #include <linux/slab.h> #include <linux/bpf.h> #include <linux/btf.h> #include "map_in_map.h" struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; CLASS(fd, f)(inner_map_ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) return ERR_PTR(-EINVAL); if (!inner_map->ops->map_meta_equal) return ERR_PTR(-ENOTSUPP); inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) return ERR_PTR(-ENOMEM); inner_map_meta->map_type = inner_map->map_type; inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->record = btf_record_dup(inner_map->record); if (IS_ERR(inner_map_meta->record)) { /* btf_record_dup returns NULL or valid pointer in case of * invalid/empty/valid, but ERR_PTR in case of errors. During * equality NULL or IS_ERR is equivalent. */ struct bpf_map *ret = ERR_CAST(inner_map_meta->record); kfree(inner_map_meta); return ret; } /* Note: We must use the same BTF, as we also used btf_record_dup above * which relies on BTF being same for both maps, as some members like * record->fields.list_head have pointers like value_rec pointing into * inner_map->btf. */ if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; } /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) { struct bpf_array *inner_array_meta = container_of(inner_map_meta, struct bpf_array, map); struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); inner_array_meta->index_mask = inner_array->index_mask; inner_array_meta->elem_size = inner_array->elem_size; inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; } return inner_map_meta; } void bpf_map_meta_free(struct bpf_map *map_meta) { bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { /* No need to compare ops because it is covered by map_type */ return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->map_flags == meta1->map_flags && btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { struct bpf_map *inner_map, *inner_map_meta; CLASS(fd, f)(ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; inner_map_meta = map->inner_map_meta; if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); return inner_map; } void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_map *inner_map = ptr; /* Defer the freeing of inner map according to the sleepable attribute * of bpf program which owns the outer map, so unnecessary waiting for * RCU tasks trace grace period can be avoided. */ if (need_defer) { if (atomic64_read(&map->sleepable_refcnt)) WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); else WRITE_ONCE(inner_map->free_after_rcu_gp, true); } bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) { return ((struct bpf_map *)ptr)->id; }
92 2393 1523 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef MM_SLAB_H #define MM_SLAB_H #include <linux/reciprocal_div.h> #include <linux/list_lru.h> #include <linux/local_lock.h> #include <linux/random.h> #include <linux/kobject.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <linux/kfence.h> #include <linux/kasan.h> /* * Internal slab definitions */ #ifdef CONFIG_64BIT # ifdef system_has_cmpxchg128 # define system_has_freelist_aba() system_has_cmpxchg128() # define try_cmpxchg_freelist try_cmpxchg128 # endif #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 typedef u128 freelist_full_t; #else /* CONFIG_64BIT */ # ifdef system_has_cmpxchg64 # define system_has_freelist_aba() system_has_cmpxchg64() # define try_cmpxchg_freelist try_cmpxchg64 # endif #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 typedef u64 freelist_full_t; #endif /* CONFIG_64BIT */ #if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) #undef system_has_freelist_aba #endif /* * Freelist pointer and counter to cmpxchg together, avoids the typical ABA * problems with cmpxchg of just a pointer. */ struct freelist_counters { union { struct { void *freelist; union { unsigned long counters; struct { unsigned inuse:16; unsigned objects:15; /* * If slab debugging is enabled then the * frozen bit can be reused to indicate * that the slab was corrupted */ unsigned frozen:1; }; }; }; #ifdef system_has_freelist_aba freelist_full_t freelist_counters; #endif }; }; /* Reuses the bits in struct page */ struct slab { memdesc_flags_t flags; struct kmem_cache *slab_cache; union { struct { union { struct list_head slab_list; struct { /* For deferred deactivate_slab() */ struct llist_node llnode; void *flush_freelist; }; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; int slabs; /* Nr of slabs left */ }; #endif }; /* Double-word boundary */ struct freelist_counters; }; struct rcu_head rcu_head; }; unsigned int __page_type; atomic_t __page_refcount; #ifdef CONFIG_SLAB_OBJ_EXT unsigned long obj_exts; #endif }; #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG SLAB_MATCH(memcg_data, obj_exts); #elif defined(CONFIG_SLAB_OBJ_EXT) SLAB_MATCH(_unused_slab_obj_exts, obj_exts); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); #if defined(system_has_freelist_aba) static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters))); #endif /** * slab_folio - The folio allocated for a slab * @s: The slab. * * Slabs are allocated as folios that contain the individual objects and are * using some fields in the first struct page of the folio - those fields are * now accessed by struct slab. It is occasionally necessary to convert back to * a folio in order to communicate with the rest of the mm. Please use this * helper function instead of casting yourself, as the implementation may change * in the future. */ #define slab_folio(s) (_Generic((s), \ const struct slab *: (const struct folio *)s, \ struct slab *: (struct folio *)s)) /** * page_slab - Converts from struct page to its slab. * @page: A page which may or may not belong to a slab. * * Return: The slab which contains this page or NULL if the page does * not belong to a slab. This includes pages returned from large kmalloc. */ static inline struct slab *page_slab(const struct page *page) { unsigned long head; head = READ_ONCE(page->compound_head); if (head & 1) page = (struct page *)(head - 1); if (data_race(page->page_type >> 24) != PGTY_slab) page = NULL; return (struct slab *)page; } /** * slab_page - The first struct page allocated for a slab * @s: The slab. * * A convenience wrapper for converting slab to the first struct page of the * underlying folio, to communicate with code not yet converted to folio or * struct slab. */ #define slab_page(s) folio_page(slab_folio(s), 0) static inline void *slab_address(const struct slab *slab) { return folio_address(slab_folio(slab)); } static inline int slab_nid(const struct slab *slab) { return memdesc_nid(slab->flags); } static inline pg_data_t *slab_pgdat(const struct slab *slab) { return NODE_DATA(slab_nid(slab)); } static inline struct slab *virt_to_slab(const void *addr) { return page_slab(virt_to_page(addr)); } static inline int slab_order(const struct slab *slab) { return folio_order(slab_folio(slab)); } static inline size_t slab_size(const struct slab *slab) { return PAGE_SIZE << slab_order(slab); } #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) #define slub_set_percpu_partial(c, p) \ ({ \ slub_percpu_partial(c) = (p)->next; \ }) #define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) #else #define slub_percpu_partial(c) NULL #define slub_set_percpu_partial(c, p) #define slub_percpu_partial_read_once(c) NULL #endif // CONFIG_SLUB_CPU_PARTIAL /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the * given order would contain. */ struct kmem_cache_order_objects { unsigned int x; }; /* * Slab cache management. */ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; struct lock_class_key lock_key; struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; unsigned int size; /* Object size including metadata */ unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *object); /* Object constructor */ unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif #ifdef CONFIG_SLAB_FREELIST_HARDENED unsigned long random; #endif #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. */ unsigned int remote_node_defrag_ratio; #endif #ifdef CONFIG_SLAB_FREELIST_RANDOM unsigned int *random_seq; #endif #ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif #ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ #endif struct kmem_cache_node *node[MAX_NUMNODES]; }; #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS 1 void sysfs_slab_unlink(struct kmem_cache *s); void sysfs_slab_release(struct kmem_cache *s); #else static inline void sysfs_slab_unlink(struct kmem_cache *s) { } static inline void sysfs_slab_release(struct kmem_cache *s) { } #endif void *fixup_red_left(struct kmem_cache *s, void *p); static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, void *x) { void *object = x - (x - slab_address(slab)) % cache->size; void *last_object = slab_address(slab) + (slab->objects - 1) * cache->size; void *result = (unlikely(object > last_object)) ? last_object : object; result = fixup_red_left(cache, result); return result; } /* Determine object index from a given position */ static inline unsigned int __obj_to_index(const struct kmem_cache *cache, void *addr, void *obj) { return reciprocal_divide(kasan_reset_tag(obj) - addr, cache->reciprocal_size); } static inline unsigned int obj_to_index(const struct kmem_cache *cache, const struct slab *slab, void *obj) { if (is_kfence_address(obj)) return 0; return __obj_to_index(cache, slab_address(slab), obj); } static inline int objs_per_slab(const struct kmem_cache *cache, const struct slab *slab) { return slab->objects; } /* * State of the slab allocator. * * This is used to describe the states of the allocator during bootup. * Allocators use this to gradually bootstrap themselves. Most allocators * have the problem that the structures used for managing slab caches are * allocated from slab caches themselves. */ enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ }; extern enum slab_state slab_state; /* The slab cache mutex protects the management structures during changes */ extern struct mutex slab_mutex; /* The list of all slab caches on the system */ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { const char *name[NR_KMALLOC_TYPES]; unsigned int size; } kmalloc_info[]; /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(void); extern u8 kmalloc_size_index[24]; static inline unsigned int size_index_elem(unsigned int bytes) { return (bytes - 1) / 8; } /* * Find the kmem_cache structure that serves a given size of * allocation * * This assumes size is larger than zero and not larger than * KMALLOC_MAX_CACHE_SIZE and the caller must check that. */ static inline struct kmem_cache * kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) { unsigned int index; if (!b) b = &kmalloc_caches[kmalloc_type(flags, caller)]; if (size <= 192) index = kmalloc_size_index[size_index_elem(size)]; else index = fls(size - 1); return (*b)[index]; } gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ int do_kmem_cache_create(struct kmem_cache *s, const char *name, unsigned int size, struct kmem_cache_args *args, slab_flags_t flags); void __init kmem_cache_init(void); extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(unsigned size, unsigned align, slab_flags_t flags, const char *name, void (*ctor)(void *)); struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name); static inline bool is_kmalloc_cache(struct kmem_cache *s) { return (s->flags & SLAB_KMALLOC); } static inline bool is_kmalloc_normal(struct kmem_cache *s) { if (!is_kmalloc_cache(s)) return false; return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); void flush_all_rcu_sheaves(void); void flush_rcu_sheaves_on_cache(struct kmem_cache *s); #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT | \ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; struct file; struct slabinfo { unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs; unsigned long num_slabs; unsigned long shared_avail; unsigned int limit; unsigned int batchcount; unsigned int shared; unsigned int objects_per_slab; unsigned int cache_order; }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); #else DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); #endif extern void print_tracking(struct kmem_cache *s, void *object); long validate_slab_cache(struct kmem_cache *s); static inline bool __slub_debug_enabled(void) { return static_branch_unlikely(&slub_debug_enabled); } #else static inline void print_tracking(struct kmem_cache *s, void *object) { } static inline bool __slub_debug_enabled(void) { return false; } #endif /* * Returns true if any of the specified slab_debug flags is enabled for the * cache. Use only for flags parsed by setup_slub_debug() as it also enables * the static key. */ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) { if (IS_ENABLED(CONFIG_SLUB_DEBUG)) VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); if (__slub_debug_enabled()) return s->flags & flags; return false; } #if IS_ENABLED(CONFIG_SLUB_DEBUG) && IS_ENABLED(CONFIG_KUNIT) bool slab_in_kunit_test(void); #else static inline bool slab_in_kunit_test(void) { return false; } #endif #ifdef CONFIG_SLAB_OBJ_EXT /* * slab_obj_exts - get the pointer to the slab object extension vector * associated with a slab. * @slab: a pointer to the slab struct * * Returns a pointer to the object extension vector associated with the slab, * or NULL if no such vector has been associated yet. */ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { unsigned long obj_exts = READ_ONCE(slab->obj_exts); #ifdef CONFIG_MEMCG /* * obj_exts should be either NULL, a valid pointer with * MEMCG_DATA_OBJEXTS bit set or be equal to OBJEXTS_ALLOC_FAIL. */ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) && obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab)); VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); #endif return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); } int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab); #else /* CONFIG_SLAB_OBJ_EXT */ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { return NULL; } #endif /* CONFIG_SLAB_OBJ_EXT */ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } #ifdef CONFIG_MEMCG bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p); void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif void kvfree_rcu_cb(struct rcu_head *head); size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object * and whatever may come after it. */ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; #endif if (s->flags & SLAB_KASAN) return s->object_size; /* * If we have the need to store the freelist pointer * back there or track user information then we can * only use the space before that information. */ if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation */ return s->size; } static inline unsigned int large_kmalloc_order(const struct page *page) { return page[1].flags.f & 0xff; } static inline size_t large_kmalloc_size(const struct page *page) { return PAGE_SIZE << large_kmalloc_order(page); } #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else static inline void dump_unreclaimable_slab(void) { } #endif void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); #ifdef CONFIG_SLAB_FREELIST_RANDOM int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp); void cache_random_seq_destroy(struct kmem_cache *cachep); #else static inline int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp) { return 0; } static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) { if (c->ctor) return false; if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) return flags & __GFP_ZERO; return true; } return flags & __GFP_ZERO; } static inline bool slab_want_init_on_free(struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free)) return !(c->ctor || (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); return false; } #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) void debugfs_slab_release(struct kmem_cache *); #else static inline void debugfs_slab_release(struct kmem_cache *s) { } #endif #ifdef CONFIG_PRINTK #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; struct slab *kp_slab; void *kp_objp; unsigned long kp_data_offset; struct kmem_cache *kp_slab_cache; void *kp_ret; void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); #endif void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); void defer_free_barrier(void); static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && (s->flags & SLAB_KMALLOC)); } #ifdef CONFIG_SLUB_DEBUG void skip_orig_size_check(struct kmem_cache *s, const void *object); #endif #endif /* MM_SLAB_H */
7 12 12 12 25 25 23 23 1 10 11 2 1 8 6 3 12 3 6 3 4 8 1 8 3 11 1 10 33 23 23 1 22 12 12 12 12 33 29 4 4 33 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/siphash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/red.h> /* Stochastic Fairness Queuing algorithm. ======================================= Source: Paul E. McKenney "Stochastic Fairness Queuing", IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. Paul E. McKenney "Stochastic Fairness Queuing", "Interworking: Research and Experience", v.2, 1991, p.113-131. See also: M. Shreedhar and George Varghese "Efficient Fair Queuing using Deficit Round Robin", Proc. SIGCOMM 95. This is not the thing that is usually called (W)FQ nowadays. It does not use any timestamp mechanism, but instead processes queues in round-robin order. ADVANTAGE: - It is very cheap. Both CPU and memory requirements are minimal. DRAWBACKS: - "Stochastic" -> It is not 100% fair. When hash collisions occur, several flows are considered as one. - "Round-robin" -> It introduces larger delays than virtual clock based schemes, and should not be used for isolating interactive traffic from non-interactive. It means, that this scheduler should be used as leaf of CBQ or P3, which put interactive traffic to higher priority band. We still need true WFQ for top level CSZ, but using WFQ for the best effort traffic is absolutely pointless: SFQ is superior for this purpose. IMPLEMENTATION: This implementation limits : - maximal queue length per flow to 127 packets. - max mtu to 2^18-1; - max 65408 flows, - number of hash buckets to 65536. It is easy to increase these values, but not in flight. */ #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ #define SFQ_DEFAULT_FLOWS 128 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ #define SFQ_EMPTY_SLOT 0xffff #define SFQ_DEFAULT_HASH_DIVISOR 1024 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ typedef u16 sfq_index; /* * We dont use pointers to save space. * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] * are 'pointers' to dep[] array */ struct sfq_head { sfq_index next; sfq_index prev; }; struct sfq_slot { struct sk_buff *skblist_next; struct sk_buff *skblist_prev; sfq_index qlen; /* number of skbs in skblist */ sfq_index next; /* next slot in sfq RR chain */ struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ int allot; /* credit for this slot */ unsigned int backlog; struct red_vars vars; }; struct sfq_sched_data { /* frequently used fields */ int limit; /* limit of total number of packets in this qdisc */ unsigned int divisor; /* number of slots in hash table */ u8 headdrop; u8 maxdepth; /* limit of packets per flow */ siphash_key_t perturbation; u8 cur_depth; /* depth of longest slot */ u8 flags; struct tcf_proto __rcu *filter_list; struct tcf_block *block; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ struct red_parms *red_parms; struct tc_sfqred_stats stats; struct sfq_slot *tail; /* current slot in round */ struct sfq_head dep[SFQ_MAX_DEPTH + 1]; /* Linked lists of slots, indexed by depth * dep[0] : list of unused flows * dep[1] : list of flows with 1 packet * dep[X] : list of flows with X packets */ unsigned int maxflows; /* number of flows in flows array */ int perturb_period; unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; struct Qdisc *sch; }; /* * sfq_head are either in a sfq_slot or in dep[] array */ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) { if (val < SFQ_MAX_FLOWS) return &q->slots[val].dep; return &q->dep[val - SFQ_MAX_FLOWS]; } static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct sfq_sched_data *q = qdisc_priv(sch); struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); if (!fl) return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return 0; } #endif if (TC_H_MIN(res.classid) <= q->divisor) return TC_H_MIN(res.classid); } return 0; } /* * x : slot number [0 .. SFQ_MAX_FLOWS - 1] */ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; struct sfq_slot *slot = &q->slots[x]; int qlen = slot->qlen; p = qlen + SFQ_MAX_FLOWS; n = q->dep[qlen].next; slot->dep.next = n; slot->dep.prev = p; q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ sfq_dep_head(q, n)->prev = x; } #define sfq_unlink(q, x, n, p) \ do { \ n = q->slots[x].dep.next; \ p = q->slots[x].dep.prev; \ sfq_dep_head(q, p)->next = n; \ sfq_dep_head(q, n)->prev = p; \ } while (0) static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; int d; sfq_unlink(q, x, n, p); d = q->slots[x].qlen--; if (n == p && q->cur_depth == d) q->cur_depth--; sfq_link(q, x); } static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; int d; sfq_unlink(q, x, n, p); d = ++q->slots[x].qlen; if (q->cur_depth < d) q->cur_depth = d; sfq_link(q, x); } /* helper functions : might be changed when/if skb use a standard list_head */ /* remove one skb from tail of slot queue */ static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) { struct sk_buff *skb = slot->skblist_prev; slot->skblist_prev = skb->prev; skb->prev->next = (struct sk_buff *)slot; skb->next = skb->prev = NULL; return skb; } /* remove one skb from head of slot queue */ static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) { struct sk_buff *skb = slot->skblist_next; slot->skblist_next = skb->next; skb->next->prev = (struct sk_buff *)slot; skb->next = skb->prev = NULL; return skb; } static inline void slot_queue_init(struct sfq_slot *slot) { memset(slot, 0, sizeof(*slot)); slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; } /* add skb to slot queue (tail add) */ static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) { skb->prev = slot->skblist_prev; skb->next = (struct sk_buff *)slot; slot->skblist_prev->next = skb; slot->skblist_prev = skb; } static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); sfq_index x, d = q->cur_depth; struct sk_buff *skb; unsigned int len; struct sfq_slot *slot; /* Queue is full! Find the longest slot and drop tail packet from it */ if (d > 1) { x = q->dep[d].next; slot = &q->slots[x]; drop: skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); slot->backlog -= len; sfq_dec(q, x); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); return len; } if (d == 1) { /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ x = q->tail->next; slot = &q->slots[x]; if (slot->next == x) q->tail = NULL; /* no more active slots */ else q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } return 0; } /* Is ECN parameter configured */ static int sfq_prob_mark(const struct sfq_sched_data *q) { return q->flags & TC_RED_ECN; } /* Should packets over max threshold just be marked */ static int sfq_hard_mark(const struct sfq_sched_data *q) { return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; } static int sfq_headdrop(const struct sfq_sched_data *q) { return q->headdrop; } static int sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; int ret; struct sk_buff *head; int delta; hash = sfq_classify(skb, sch, &ret); if (hash == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } hash--; x = q->ht[hash]; slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) return qdisc_drop(skb, sch, to_free); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; slot->backlog = 0; /* should already be 0 anyway... */ red_set_vars(&slot->vars); goto enqueue; } if (q->red_parms) { slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, &slot->vars, slot->backlog); switch (red_action(q->red_parms, &slot->vars, slot->vars.qavg)) { case RED_DONT_MARK: break; case RED_PROB_MARK: qdisc_qstats_overlimit(sch); if (sfq_prob_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && INET_ECN_set_ce(slot->skblist_next)) { q->stats.prob_mark_head++; break; } if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; break; } } q->stats.prob_drop++; goto congestion_drop; case RED_HARD_MARK: qdisc_qstats_overlimit(sch); if (sfq_hard_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && INET_ECN_set_ce(slot->skblist_next)) { q->stats.forced_mark_head++; break; } if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; break; } } q->stats.forced_drop++; goto congestion_drop; } } if (slot->qlen >= q->maxdepth) { congestion_drop: if (!sfq_headdrop(q)) return qdisc_drop(skb, sch, to_free); /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; slot->backlog -= delta; qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); qdisc_tree_reduce_backlog(sch, 0, delta); return NET_XMIT_CN; } enqueue: qdisc_qstats_backlog_inc(sch, skb); slot->backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; } else { slot->next = q->tail->next; q->tail->next = x; } /* We put this flow at the end of our flow list. * This might sound unfair for a new flow to wait after old ones, * but we could endup servicing new flows only, and freeze old ones. */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ slot->allot = q->quantum; } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; qlen = slot->qlen; dropped = sfq_drop(sch, to_free); /* Return Congestion Notification only if we dropped a packet * from this flow. */ if (qlen != slot->qlen) { qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); return NET_XMIT_CN; } /* As we dropped a packet, better let upper stack know this */ qdisc_tree_reduce_backlog(sch, 1, dropped); return NET_XMIT_SUCCESS; } static struct sk_buff * sfq_dequeue(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; sfq_index a, next_a; struct sfq_slot *slot; /* No active slots */ if (q->tail == NULL) return NULL; next_slot: a = q->tail->next; slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; slot->allot += q->quantum; goto next_slot; } skb = slot_dequeue_head(slot); sfq_dec(q, a); qdisc_bstats_update(sch, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ if (slot->qlen == 0) { q->ht[slot->hash] = SFQ_EMPTY_SLOT; next_a = slot->next; if (a == next_a) { q->tail = NULL; /* no more active slots */ return skb; } q->tail->next = next_a; } else { slot->allot -= qdisc_pkt_len(skb); } return skb; } static void sfq_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = sfq_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } /* * When q->perturbation is changed, we rehash all queued skbs * to avoid OOO (Out Of Order) effects. * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change * counters. */ static void sfq_rehash(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; int i; struct sfq_slot *slot; struct sk_buff_head list; int dropped = 0; unsigned int drop_len = 0; __skb_queue_head_init(&list); for (i = 0; i < q->maxflows; i++) { slot = &q->slots[i]; if (!slot->qlen) continue; while (slot->qlen) { skb = slot_dequeue_head(slot); sfq_dec(q, i); __skb_queue_tail(&list, skb); } slot->backlog = 0; red_set_vars(&slot->vars); q->ht[slot->hash] = SFQ_EMPTY_SLOT; } q->tail = NULL; while ((skb = __skb_dequeue(&list)) != NULL) { unsigned int hash = sfq_hash(q, skb); sfq_index x = q->ht[hash]; slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) { drop: qdisc_qstats_backlog_dec(sch, skb); drop_len += qdisc_pkt_len(skb); kfree_skb(skb); dropped++; continue; } q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; } if (slot->qlen >= q->maxdepth) goto drop; slot_queue_add(slot, skb); if (q->red_parms) slot->vars.qavg = red_calc_qavg(q->red_parms, &slot->vars, slot->backlog); slot->backlog += qdisc_pkt_len(skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; } else { slot->next = q->tail->next; q->tail->next = x; } q->tail = slot; slot->allot = q->quantum; } } sch->q.qlen -= dropped; qdisc_tree_reduce_backlog(sch, dropped, drop_len); } static void sfq_perturbation(struct timer_list *t) { struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; int period; get_random_bytes(&nkey, sizeof(nkey)); rcu_read_lock(); root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); q->perturbation = nkey; if (!q->filter_list && q->tail) sfq_rehash(sch); spin_unlock(root_lock); /* q->perturb_period can change under us from * sfq_change() and sfq_destroy(). */ period = READ_ONCE(q->perturb_period); if (period) mod_timer(&q->perturb_timer, jiffies + period); rcu_read_unlock(); } static int sfq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); struct tc_sfq_qopt_v1 *ctl_v1 = NULL; unsigned int qlen, dropped = 0; struct red_parms *p = NULL; struct sk_buff *to_free = NULL; struct sk_buff *tail = NULL; unsigned int maxflows; unsigned int quantum; unsigned int divisor; int perturb_period; u8 headdrop; u8 maxdepth; int limit; u8 flags; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) ctl_v1 = nla_data(opt); if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; if ((int)ctl->quantum < 0) { NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); return -EINVAL; } if (ctl->perturb_period < 0 || ctl->perturb_period > INT_MAX / HZ) { NL_SET_ERR_MSG_MOD(extack, "invalid perturb period"); return -EINVAL; } perturb_period = ctl->perturb_period * HZ; if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; } sch_tree_lock(sch); limit = q->limit; divisor = q->divisor; headdrop = q->headdrop; maxdepth = q->maxdepth; maxflows = q->maxflows; quantum = q->quantum; flags = q->flags; /* update and validate configuration */ if (ctl->quantum) quantum = ctl->quantum; if (ctl->flows) maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { divisor = ctl->divisor; maxflows = min_t(u32, maxflows, divisor); } if (ctl_v1) { if (ctl_v1->depth) maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); if (p) { red_set_parms(p, ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Plog, ctl_v1->Scell_log, NULL, ctl_v1->max_P); } flags = ctl_v1->flags; headdrop = ctl_v1->headdrop; } if (ctl->limit) { limit = min_t(u32, ctl->limit, maxdepth * maxflows); maxflows = min_t(u32, maxflows, limit); } if (limit == 1) { sch_tree_unlock(sch); kfree(p); NL_SET_ERR_MSG_MOD(extack, "invalid limit"); return -EINVAL; } /* commit configuration */ q->limit = limit; q->divisor = divisor; q->headdrop = headdrop; q->maxdepth = maxdepth; q->maxflows = maxflows; WRITE_ONCE(q->perturb_period, perturb_period); q->quantum = quantum; q->flags = flags; if (p) swap(q->red_parms, p); qlen = sch->q.qlen; while (sch->q.qlen > q->limit) { dropped += sfq_drop(sch, &to_free); if (!tail) tail = to_free; } rtnl_kfree_skbs(to_free, tail); qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); timer_delete(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); } sch_tree_unlock(sch); kfree(p); return 0; } static void *sfq_alloc(size_t sz) { return kvmalloc(sz, GFP_KERNEL); } static void sfq_free(void *addr) { kvfree(addr); } static void sfq_destroy(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); WRITE_ONCE(q->perturb_period, 0); timer_delete_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); kfree(q->red_parms); } static int sfq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); int i; int err; q->sch = sch; timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { q->dep[i].next = i + SFQ_MAX_FLOWS; q->dep[i].prev = i + SFQ_MAX_FLOWS; } q->limit = SFQ_MAX_DEPTH; q->maxdepth = SFQ_MAX_DEPTH; q->cur_depth = 0; q->tail = NULL; q->divisor = SFQ_DEFAULT_HASH_DIVISOR; q->maxflows = SFQ_DEFAULT_FLOWS; q->quantum = psched_mtu(qdisc_dev(sch)); q->perturb_period = 0; get_random_bytes(&q->perturbation, sizeof(q->perturbation)); if (opt) { int err = sfq_change(sch, opt, extack); if (err) return err; } q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); if (!q->ht || !q->slots) { /* Note: sfq_destroy() will be called by our caller */ return -ENOMEM; } for (i = 0; i < q->divisor; i++) q->ht[i] = SFQ_EMPTY_SLOT; for (i = 0; i < q->maxflows; i++) { slot_queue_init(&q->slots[i]); sfq_link(q, i); } if (q->limit >= 1) sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_sfq_qopt_v1 opt; struct red_parms *p = q->red_parms; memset(&opt, 0, sizeof(opt)); opt.v0.quantum = q->quantum; opt.v0.perturb_period = q->perturb_period / HZ; opt.v0.limit = q->limit; opt.v0.divisor = q->divisor; opt.v0.flows = q->maxflows; opt.depth = q->maxdepth; opt.headdrop = q->headdrop; if (p) { opt.qth_min = p->qth_min >> p->Wlog; opt.qth_max = p->qth_max >> p->Wlog; opt.Wlog = p->Wlog; opt.Plog = p->Plog; opt.Scell_log = p->Scell_log; opt.max_P = p->max_P; } memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); opt.flags = q->flags; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long sfq_find(struct Qdisc *sch, u32 classid) { return 0; } static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static void sfq_unbind(struct Qdisc *q, unsigned long cl) { } static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct sfq_sched_data *q = qdisc_priv(sch); sfq_index idx = q->ht[cl - 1]; struct gnet_stats_queue qs = { 0 }; struct tc_sfq_xstats xstats = { 0 }; if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; xstats.allot = slot->allot; qs.qlen = slot->qlen; qs.backlog = slot->backlog; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int i; if (arg->stop) return; for (i = 0; i < q->divisor; i++) { if (q->ht[i] == SFQ_EMPTY_SLOT) { arg->count++; continue; } if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static const struct Qdisc_class_ops sfq_class_ops = { .leaf = sfq_leaf, .find = sfq_find, .tcf_block = sfq_tcf_block, .bind_tcf = sfq_bind, .unbind_tcf = sfq_unbind, .dump = sfq_dump_class, .dump_stats = sfq_dump_class_stats, .walk = sfq_walk, }; static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .cl_ops = &sfq_class_ops, .id = "sfq", .priv_size = sizeof(struct sfq_sched_data), .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, .peek = qdisc_peek_dequeued, .init = sfq_init, .reset = sfq_reset, .destroy = sfq_destroy, .change = NULL, .dump = sfq_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("sfq"); static int __init sfq_module_init(void) { return register_qdisc(&sfq_qdisc_ops); } static void __exit sfq_module_exit(void) { unregister_qdisc(&sfq_qdisc_ops); } module_init(sfq_module_init) module_exit(sfq_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Stochastic Fairness qdisc");
27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2014 Felix Fietkau <nbd@nbd.name> * Copyright (C) 2004 - 2009 Ivo van Doorn <IvDoorn@gmail.com> */ #ifndef _LINUX_BITFIELD_H #define _LINUX_BITFIELD_H #include <linux/build_bug.h> #include <linux/typecheck.h> #include <asm/byteorder.h> /* * Bitfield access macros * * FIELD_{GET,PREP} macros take as first parameter shifted mask * from which they extract the base mask and shift amount. * Mask must be a compilation time constant. * field_{get,prep} are variants that take a non-const mask. * * Example: * * #include <linux/bitfield.h> * #include <linux/bits.h> * * #define REG_FIELD_A GENMASK(6, 0) * #define REG_FIELD_B BIT(7) * #define REG_FIELD_C GENMASK(15, 8) * #define REG_FIELD_D GENMASK(31, 16) * * Get: * a = FIELD_GET(REG_FIELD_A, reg); * b = FIELD_GET(REG_FIELD_B, reg); * * Set: * reg = FIELD_PREP(REG_FIELD_A, 1) | * FIELD_PREP(REG_FIELD_B, 0) | * FIELD_PREP(REG_FIELD_C, c) | * FIELD_PREP(REG_FIELD_D, 0x40); * * Modify: * FIELD_MODIFY(REG_FIELD_C, &reg, c); */ #define __bf_shf(x) (__builtin_ffsll(x) - 1) #define __scalar_type_to_unsigned_cases(type) \ unsigned type: (unsigned type)0, \ signed type: (unsigned type)0 #define __unsigned_scalar_typeof(x) typeof( \ _Generic((x), \ char: (unsigned char)0, \ __scalar_type_to_unsigned_cases(char), \ __scalar_type_to_unsigned_cases(short), \ __scalar_type_to_unsigned_cases(int), \ __scalar_type_to_unsigned_cases(long), \ __scalar_type_to_unsigned_cases(long long), \ default: (x))) #define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x)) #define __BF_FIELD_CHECK_MASK(_mask, _val, _pfx) \ ({ \ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ _pfx "mask is not constant"); \ BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ ~((_mask) >> __bf_shf(_mask)) & \ (0 + (_val)) : 0, \ _pfx "value too large for the field"); \ __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ (1ULL << __bf_shf(_mask))); \ }) #define __BF_FIELD_CHECK_REG(mask, reg, pfx) \ BUILD_BUG_ON_MSG(__bf_cast_unsigned(mask, mask) > \ __bf_cast_unsigned(reg, ~0ull), \ pfx "type of reg too small for mask") #define __BF_FIELD_CHECK(mask, reg, val, pfx) \ ({ \ __BF_FIELD_CHECK_MASK(mask, val, pfx); \ __BF_FIELD_CHECK_REG(mask, reg, pfx); \ }) #define __FIELD_PREP(mask, val, pfx) \ ({ \ __BF_FIELD_CHECK_MASK(mask, val, pfx); \ ((typeof(mask))(val) << __bf_shf(mask)) & (mask); \ }) #define __FIELD_GET(mask, reg, pfx) \ ({ \ __BF_FIELD_CHECK_MASK(mask, 0U, pfx); \ (typeof(mask))(((reg) & (mask)) >> __bf_shf(mask)); \ }) /** * FIELD_MAX() - produce the maximum value representable by a field * @_mask: shifted mask defining the field's length and position * * FIELD_MAX() returns the maximum value that can be held in the field * specified by @_mask. */ #define FIELD_MAX(_mask) \ ({ \ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \ (typeof(_mask))((_mask) >> __bf_shf(_mask)); \ }) /** * FIELD_FIT() - check if value fits in the field * @_mask: shifted mask defining the field's length and position * @_val: value to test against the field * * Return: true if @_val can fit inside @_mask, false if @_val is too big. */ #define FIELD_FIT(_mask, _val) \ ({ \ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \ !((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \ }) /** * FIELD_PREP() - prepare a bitfield element * @_mask: shifted mask defining the field's length and position * @_val: value to put in the field * * FIELD_PREP() masks and shifts up the value. The result should * be combined with other fields of the bitfield using logical OR. */ #define FIELD_PREP(_mask, _val) \ ({ \ __BF_FIELD_CHECK_REG(_mask, 0ULL, "FIELD_PREP: "); \ __FIELD_PREP(_mask, _val, "FIELD_PREP: "); \ }) #define __BF_CHECK_POW2(n) BUILD_BUG_ON_ZERO(((n) & ((n) - 1)) != 0) /** * FIELD_PREP_CONST() - prepare a constant bitfield element * @_mask: shifted mask defining the field's length and position * @_val: value to put in the field * * FIELD_PREP_CONST() masks and shifts up the value. The result should * be combined with other fields of the bitfield using logical OR. * * Unlike FIELD_PREP() this is a constant expression and can therefore * be used in initializers. Error checking is less comfortable for this * version, and non-constant masks cannot be used. */ #define FIELD_PREP_CONST(_mask, _val) \ ( \ /* mask must be non-zero */ \ BUILD_BUG_ON_ZERO((_mask) == 0) + \ /* check if value fits */ \ BUILD_BUG_ON_ZERO(~((_mask) >> __bf_shf(_mask)) & (_val)) + \ /* check if mask is contiguous */ \ __BF_CHECK_POW2((_mask) + (1ULL << __bf_shf(_mask))) + \ /* and create the value */ \ (((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask)) \ ) /** * FIELD_GET() - extract a bitfield element * @_mask: shifted mask defining the field's length and position * @_reg: value of entire bitfield * * FIELD_GET() extracts the field specified by @_mask from the * bitfield passed in as @_reg by masking and shifting it down. */ #define FIELD_GET(_mask, _reg) \ ({ \ __BF_FIELD_CHECK_REG(_mask, _reg, "FIELD_GET: "); \ __FIELD_GET(_mask, _reg, "FIELD_GET: "); \ }) /** * FIELD_MODIFY() - modify a bitfield element * @_mask: shifted mask defining the field's length and position * @_reg_p: pointer to the memory that should be updated * @_val: value to store in the bitfield * * FIELD_MODIFY() modifies the set of bits in @_reg_p specified by @_mask, * by replacing them with the bitfield value passed in as @_val. */ #define FIELD_MODIFY(_mask, _reg_p, _val) \ ({ \ typecheck_pointer(_reg_p); \ __BF_FIELD_CHECK(_mask, *(_reg_p), _val, "FIELD_MODIFY: "); \ *(_reg_p) &= ~(_mask); \ *(_reg_p) |= (((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask)); \ }) extern void __compiletime_error("value doesn't fit into mask") __field_overflow(void); extern void __compiletime_error("bad bitfield mask") __bad_mask(void); static __always_inline u64 field_multiplier(u64 field) { if ((field | (field - 1)) & ((field | (field - 1)) + 1)) __bad_mask(); return field & -field; } static __always_inline u64 field_mask(u64 field) { return field / field_multiplier(field); } #define field_max(field) ((typeof(field))field_mask(field)) #define ____MAKE_OP(type,base,to,from) \ static __always_inline __##type __must_check type##_encode_bits(base v, base field) \ { \ if (__builtin_constant_p(v) && (v & ~field_mask(field))) \ __field_overflow(); \ return to((v & field_mask(field)) * field_multiplier(field)); \ } \ static __always_inline __##type __must_check type##_replace_bits(__##type old, \ base val, base field) \ { \ return (old & ~to(field)) | type##_encode_bits(val, field); \ } \ static __always_inline void type##p_replace_bits(__##type *p, \ base val, base field) \ { \ *p = (*p & ~to(field)) | type##_encode_bits(val, field); \ } \ static __always_inline base __must_check type##_get_bits(__##type v, base field) \ { \ return (from(v) & field)/field_multiplier(field); \ } #define __MAKE_OP(size) \ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \ ____MAKE_OP(u##size,u##size,,) ____MAKE_OP(u8,u8,,) __MAKE_OP(16) __MAKE_OP(32) __MAKE_OP(64) #undef __MAKE_OP #undef ____MAKE_OP #define __field_prep(mask, val) \ ({ \ __auto_type __mask = (mask); \ typeof(__mask) __val = (val); \ unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ? \ __ffs(__mask) : __ffs64(__mask); \ (__val << __shift) & __mask; \ }) #define __field_get(mask, reg) \ ({ \ __auto_type __mask = (mask); \ typeof(__mask) __reg = (reg); \ unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ? \ __ffs(__mask) : __ffs64(__mask); \ (__reg & __mask) >> __shift; \ }) /** * field_prep() - prepare a bitfield element * @mask: shifted mask defining the field's length and position, must be * non-zero * @val: value to put in the field * * Return: field value masked and shifted to its final destination * * field_prep() masks and shifts up the value. The result should be * combined with other fields of the bitfield using logical OR. * Unlike FIELD_PREP(), @mask is not limited to a compile-time constant. * Typical usage patterns are a value stored in a table, or calculated by * shifting a constant by a variable number of bits. * If you want to ensure that @mask is a compile-time constant, please use * FIELD_PREP() directly instead. */ #define field_prep(mask, val) \ (__builtin_constant_p(mask) ? __FIELD_PREP(mask, val, "field_prep: ") \ : __field_prep(mask, val)) /** * field_get() - extract a bitfield element * @mask: shifted mask defining the field's length and position, must be * non-zero * @reg: value of entire bitfield * * Return: extracted field value * * field_get() extracts the field specified by @mask from the * bitfield passed in as @reg by masking and shifting it down. * Unlike FIELD_GET(), @mask is not limited to a compile-time constant. * Typical usage patterns are a value stored in a table, or calculated by * shifting a constant by a variable number of bits. * If you want to ensure that @mask is a compile-time constant, please use * FIELD_GET() directly instead. */ #define field_get(mask, reg) \ (__builtin_constant_p(mask) ? __FIELD_GET(mask, reg, "field_get: ") \ : __field_get(mask, reg)) #endif
4 4 4 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API support for SHA-1 and HMAC-SHA1 * * Copyright (c) Alan Smithee. * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> * Copyright 2025 Google LLC */ #include <crypto/internal/hash.h> #include <crypto/sha1.h> #include <linux/kernel.h> #include <linux/module.h> /* * Export and import functions. crypto_shash wants a particular format that * matches that used by some legacy drivers. It currently is the same as the * library SHA context, except the value in bytecount must be block-aligned and * the remainder must be stored in an extra u8 appended to the struct. */ #define SHA1_SHASH_STATE_SIZE (sizeof(struct sha1_ctx) + 1) static_assert(sizeof(struct sha1_ctx) == sizeof(struct sha1_state)); static_assert(offsetof(struct sha1_ctx, state) == offsetof(struct sha1_state, state)); static_assert(offsetof(struct sha1_ctx, bytecount) == offsetof(struct sha1_state, count)); static_assert(offsetof(struct sha1_ctx, buf) == offsetof(struct sha1_state, buffer)); static int __crypto_sha1_export(const struct sha1_ctx *ctx0, void *out) { struct sha1_ctx ctx = *ctx0; unsigned int partial; u8 *p = out; partial = ctx.bytecount % SHA1_BLOCK_SIZE; ctx.bytecount -= partial; memcpy(p, &ctx, sizeof(ctx)); p += sizeof(ctx); *p = partial; return 0; } static int __crypto_sha1_import(struct sha1_ctx *ctx, const void *in) { const u8 *p = in; memcpy(ctx, p, sizeof(*ctx)); p += sizeof(*ctx); ctx->bytecount += *p; return 0; } static int __crypto_sha1_export_core(const struct sha1_ctx *ctx, void *out) { memcpy(out, ctx, offsetof(struct sha1_ctx, buf)); return 0; } static int __crypto_sha1_import_core(struct sha1_ctx *ctx, const void *in) { memcpy(ctx, in, offsetof(struct sha1_ctx, buf)); return 0; } const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, 0xaf, 0xd8, 0x07, 0x09 }; EXPORT_SYMBOL_GPL(sha1_zero_message_hash); #define SHA1_CTX(desc) ((struct sha1_ctx *)shash_desc_ctx(desc)) static int crypto_sha1_init(struct shash_desc *desc) { sha1_init(SHA1_CTX(desc)); return 0; } static int crypto_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha1_update(SHA1_CTX(desc), data, len); return 0; } static int crypto_sha1_final(struct shash_desc *desc, u8 *out) { sha1_final(SHA1_CTX(desc), out); return 0; } static int crypto_sha1_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha1(data, len, out); return 0; } static int crypto_sha1_export(struct shash_desc *desc, void *out) { return __crypto_sha1_export(SHA1_CTX(desc), out); } static int crypto_sha1_import(struct shash_desc *desc, const void *in) { return __crypto_sha1_import(SHA1_CTX(desc), in); } static int crypto_sha1_export_core(struct shash_desc *desc, void *out) { return __crypto_sha1_export_core(SHA1_CTX(desc), out); } static int crypto_sha1_import_core(struct shash_desc *desc, const void *in) { return __crypto_sha1_import_core(SHA1_CTX(desc), in); } #define HMAC_SHA1_KEY(tfm) ((struct hmac_sha1_key *)crypto_shash_ctx(tfm)) #define HMAC_SHA1_CTX(desc) ((struct hmac_sha1_ctx *)shash_desc_ctx(desc)) static int crypto_hmac_sha1_setkey(struct crypto_shash *tfm, const u8 *raw_key, unsigned int keylen) { hmac_sha1_preparekey(HMAC_SHA1_KEY(tfm), raw_key, keylen); return 0; } static int crypto_hmac_sha1_init(struct shash_desc *desc) { hmac_sha1_init(HMAC_SHA1_CTX(desc), HMAC_SHA1_KEY(desc->tfm)); return 0; } static int crypto_hmac_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { hmac_sha1_update(HMAC_SHA1_CTX(desc), data, len); return 0; } static int crypto_hmac_sha1_final(struct shash_desc *desc, u8 *out) { hmac_sha1_final(HMAC_SHA1_CTX(desc), out); return 0; } static int crypto_hmac_sha1_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { hmac_sha1(HMAC_SHA1_KEY(desc->tfm), data, len, out); return 0; } static int crypto_hmac_sha1_export(struct shash_desc *desc, void *out) { return __crypto_sha1_export(&HMAC_SHA1_CTX(desc)->sha_ctx, out); } static int crypto_hmac_sha1_import(struct shash_desc *desc, const void *in) { struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc); ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate; return __crypto_sha1_import(&ctx->sha_ctx, in); } static int crypto_hmac_sha1_export_core(struct shash_desc *desc, void *out) { return __crypto_sha1_export_core(&HMAC_SHA1_CTX(desc)->sha_ctx, out); } static int crypto_hmac_sha1_import_core(struct shash_desc *desc, const void *in) { struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc); ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate; return __crypto_sha1_import_core(&ctx->sha_ctx, in); } static struct shash_alg algs[] = { { .base.cra_name = "sha1", .base.cra_driver_name = "sha1-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA1_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = SHA1_DIGEST_SIZE, .init = crypto_sha1_init, .update = crypto_sha1_update, .final = crypto_sha1_final, .digest = crypto_sha1_digest, .export = crypto_sha1_export, .import = crypto_sha1_import, .export_core = crypto_sha1_export_core, .import_core = crypto_sha1_import_core, .descsize = sizeof(struct sha1_ctx), .statesize = SHA1_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha1)", .base.cra_driver_name = "hmac-sha1-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA1_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct hmac_sha1_key), .base.cra_module = THIS_MODULE, .digestsize = SHA1_DIGEST_SIZE, .setkey = crypto_hmac_sha1_setkey, .init = crypto_hmac_sha1_init, .update = crypto_hmac_sha1_update, .final = crypto_hmac_sha1_final, .digest = crypto_hmac_sha1_digest, .export = crypto_hmac_sha1_export, .import = crypto_hmac_sha1_import, .export_core = crypto_hmac_sha1_export_core, .import_core = crypto_hmac_sha1_import_core, .descsize = sizeof(struct hmac_sha1_ctx), .statesize = SHA1_SHASH_STATE_SIZE, }, }; static int __init crypto_sha1_mod_init(void) { return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } module_init(crypto_sha1_mod_init); static void __exit crypto_sha1_mod_exit(void) { crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_exit(crypto_sha1_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto API support for SHA-1 and HMAC-SHA1"); MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-lib"); MODULE_ALIAS_CRYPTO("hmac(sha1)"); MODULE_ALIAS_CRYPTO("hmac-sha1-lib");
2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 // SPDX-License-Identifier: GPL-2.0-or-later /* * CTR: Counter mode * * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/algapi.h> #include <crypto/ctr.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> struct crypto_rfc3686_ctx { struct crypto_skcipher *child; u8 nonce[CTR_RFC3686_NONCE_SIZE]; }; struct crypto_rfc3686_req_ctx { u8 iv[CTR_RFC3686_BLOCK_SIZE]; struct skcipher_request subreq CRYPTO_MINALIGN_ATTR; }; static void crypto_ctr_crypt_final(struct skcipher_walk *walk, struct crypto_cipher *tfm) { unsigned int bsize = crypto_cipher_blocksize(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); u8 *ctrblk = walk->iv; u8 tmp[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *keystream = PTR_ALIGN(tmp + 0, alignmask + 1); const u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; crypto_cipher_encrypt_one(tfm, keystream, ctrblk); crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, bsize); } static int crypto_ctr_crypt_segment(struct skcipher_walk *walk, struct crypto_cipher *tfm) { void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = crypto_cipher_alg(tfm)->cia_encrypt; unsigned int bsize = crypto_cipher_blocksize(tfm); u8 *ctrblk = walk->iv; const u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; do { /* create keystream */ fn(crypto_cipher_tfm(tfm), dst, ctrblk); crypto_xor(dst, src, bsize); /* increment counter in counterblock */ crypto_inc(ctrblk, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_ctr_crypt_inplace(struct skcipher_walk *walk, struct crypto_cipher *tfm) { void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = crypto_cipher_alg(tfm)->cia_encrypt; unsigned int bsize = crypto_cipher_blocksize(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); unsigned int nbytes = walk->nbytes; u8 *dst = walk->dst.virt.addr; u8 *ctrblk = walk->iv; u8 tmp[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *keystream = PTR_ALIGN(tmp + 0, alignmask + 1); do { /* create keystream */ fn(crypto_cipher_tfm(tfm), keystream, ctrblk); crypto_xor(dst, keystream, bsize); /* increment counter in counterblock */ crypto_inc(ctrblk, bsize); dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_ctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); const unsigned int bsize = crypto_cipher_blocksize(cipher); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes >= bsize) { if (walk.src.virt.addr == walk.dst.virt.addr) nbytes = crypto_ctr_crypt_inplace(&walk, cipher); else nbytes = crypto_ctr_crypt_segment(&walk, cipher); err = skcipher_walk_done(&walk, nbytes); } if (walk.nbytes) { crypto_ctr_crypt_final(&walk, cipher); err = skcipher_walk_done(&walk, 0); } return err; } static int crypto_ctr_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; struct crypto_alg *alg; int err; inst = skcipher_alloc_instance_simple(tmpl, tb); if (IS_ERR(inst)) return PTR_ERR(inst); alg = skcipher_ialg_simple(inst); /* Block size must be >= 4 bytes. */ err = -EINVAL; if (alg->cra_blocksize < 4) goto out_free_inst; /* If this is false we'd fail the alignment of crypto_inc. */ if (alg->cra_blocksize % 4) goto out_free_inst; /* CTR mode is a stream cipher. */ inst->alg.base.cra_blocksize = 1; /* * To simplify the implementation, configure the skcipher walk to only * give a partial block at the very end, never earlier. */ inst->alg.chunksize = alg->cra_blocksize; inst->alg.encrypt = crypto_ctr_crypt; inst->alg.decrypt = crypto_ctr_crypt; err = skcipher_register_instance(tmpl, inst); if (err) { out_free_inst: inst->free(inst); } return err; } static int crypto_rfc3686_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_skcipher *child = ctx->child; /* the nonce is stored in bytes at end of key */ if (keylen < CTR_RFC3686_NONCE_SIZE) return -EINVAL; memcpy(ctx->nonce, key + (keylen - CTR_RFC3686_NONCE_SIZE), CTR_RFC3686_NONCE_SIZE); keylen -= CTR_RFC3686_NONCE_SIZE; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_skcipher_setkey(child, key, keylen); } static int crypto_rfc3686_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *child = ctx->child; unsigned long align = crypto_skcipher_alignmask(tfm); struct crypto_rfc3686_req_ctx *rctx = (void *)PTR_ALIGN((u8 *)skcipher_request_ctx(req), align + 1); struct skcipher_request *subreq = &rctx->subreq; u8 *iv = rctx->iv; /* set up counter block */ memcpy(iv, ctx->nonce, CTR_RFC3686_NONCE_SIZE); memcpy(iv + CTR_RFC3686_NONCE_SIZE, req->iv, CTR_RFC3686_IV_SIZE); /* initialize counter portion of counter block */ *(__be32 *)(iv + CTR_RFC3686_NONCE_SIZE + CTR_RFC3686_IV_SIZE) = cpu_to_be32(1); skcipher_request_set_tfm(subreq, child); skcipher_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, iv); return crypto_skcipher_encrypt(subreq); } static int crypto_rfc3686_init_tfm(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_skcipher_spawn *spawn = skcipher_instance_ctx(inst); struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *cipher; unsigned long align; unsigned int reqsize; cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; align = crypto_skcipher_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); reqsize = align + sizeof(struct crypto_rfc3686_req_ctx) + crypto_skcipher_reqsize(cipher); crypto_skcipher_set_reqsize(tfm, reqsize); return 0; } static void crypto_rfc3686_exit_tfm(struct crypto_skcipher *tfm) { struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); crypto_free_skcipher(ctx->child); } static void crypto_rfc3686_free(struct skcipher_instance *inst) { struct crypto_skcipher_spawn *spawn = skcipher_instance_ctx(inst); crypto_drop_skcipher(spawn); kfree(inst); } static int crypto_rfc3686_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; struct crypto_skcipher_spawn *spawn; struct skcipher_alg_common *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = skcipher_instance_ctx(inst); err = crypto_grab_skcipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_skcipher_alg_common(spawn); /* We only support 16-byte blocks. */ err = -EINVAL; if (alg->ivsize != CTR_RFC3686_BLOCK_SIZE) goto err_free_inst; /* Not a stream cipher? */ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc3686(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc3686(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.ivsize = CTR_RFC3686_IV_SIZE; inst->alg.chunksize = alg->chunksize; inst->alg.min_keysize = alg->min_keysize + CTR_RFC3686_NONCE_SIZE; inst->alg.max_keysize = alg->max_keysize + CTR_RFC3686_NONCE_SIZE; inst->alg.setkey = crypto_rfc3686_setkey; inst->alg.encrypt = crypto_rfc3686_crypt; inst->alg.decrypt = crypto_rfc3686_crypt; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc3686_ctx); inst->alg.init = crypto_rfc3686_init_tfm; inst->alg.exit = crypto_rfc3686_exit_tfm; inst->free = crypto_rfc3686_free; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc3686_free(inst); } return err; } static struct crypto_template crypto_ctr_tmpls[] = { { .name = "ctr", .create = crypto_ctr_create, .module = THIS_MODULE, }, { .name = "rfc3686", .create = crypto_rfc3686_create, .module = THIS_MODULE, }, }; static int __init crypto_ctr_module_init(void) { return crypto_register_templates(crypto_ctr_tmpls, ARRAY_SIZE(crypto_ctr_tmpls)); } static void __exit crypto_ctr_module_exit(void) { crypto_unregister_templates(crypto_ctr_tmpls, ARRAY_SIZE(crypto_ctr_tmpls)); } module_init(crypto_ctr_module_init); module_exit(crypto_ctr_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CTR block cipher mode of operation"); MODULE_ALIAS_CRYPTO("rfc3686"); MODULE_ALIAS_CRYPTO("ctr"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
4 4 4 13 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 // SPDX-License-Identifier: GPL-2.0 /* * gendisk handling * * Portions Copyright (C) 2020 Christoph Hellwig */ #include <linux/module.h> #include <linux/ctype.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> #include <linux/blktrace_api.h> #include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" static struct kobject *block_depr; /* * Unique, monotonically increasing sequential number associated with block * devices instances (i.e. incremented each time a device is attached). * Associating uevents with block devices in userspace is difficult and racy: * the uevent netlink socket is lossy, and on slow and overloaded systems has * a very high latency. * Block devices do not have exclusive owners in userspace, any process can set * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 * can be reused again and again). * A userspace process setting up a block device and watching for its events * cannot thus reliably tell whether an event relates to the device it just set * up or another earlier instance with the same name. * This sequential number allows userspace processes to solve this problem, and * uniquely associate an uevent to the lifetime to a device. */ static atomic64_t diskseq; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); void set_capacity(struct gendisk *disk, sector_t sectors) { if (sectors > BLK_DEV_MAX_SECTORS) { pr_warn_once("%s: truncate capacity from %lld to %lld\n", disk->disk_name, sectors, BLK_DEV_MAX_SECTORS); sectors = BLK_DEV_MAX_SECTORS; } bdev_set_nr_sectors(disk->part0, sectors); } EXPORT_SYMBOL(set_capacity); /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. */ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); /* * Only print a message and send a uevent if the gendisk is user visible * and alive. This avoids spamming the log and udev when setting the * initial capacity during probing. */ if (size == capacity || !disk_live(disk) || (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info_ratelimited("%s: detected capacity change from %lld to %lld\n", disk->disk_name, capacity, size); /* * Historically we did not send a uevent for changes to/from an empty * device. */ if (!capacity || !size) return false; kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); static void part_stat_read_all(struct block_device *part, struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { stat->nsecs[group] += ptr->nsecs[group]; stat->sectors[group] += ptr->sectors[group]; stat->ios[group] += ptr->ios[group]; stat->merges[group] += ptr->merges[group]; } stat->io_ticks += ptr->io_ticks; } } static void bdev_count_inflight_rw(struct block_device *part, unsigned int inflight[2], bool mq_driver) { int write = 0; int read = 0; int cpu; if (mq_driver) { blk_mq_in_driver_rw(part, inflight); return; } for_each_possible_cpu(cpu) { read += part_stat_local_read_cpu(part, in_flight[READ], cpu); write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu); } /* * While iterating all CPUs, some IOs may be issued from a CPU already * traversed and complete on a CPU that has not yet been traversed, * causing the inflight number to be negative. */ inflight[READ] = read > 0 ? read : 0; inflight[WRITE] = write > 0 ? write : 0; } /** * bdev_count_inflight - get the number of inflight IOs for a block device. * * @part: the block device. * * Inflight here means started IO accounting, from bdev_start_io_acct() for * bio-based block device, and from blk_account_io_start() for rq-based block * device. */ unsigned int bdev_count_inflight(struct block_device *part) { unsigned int inflight[2] = {0}; bdev_count_inflight_rw(part, inflight, false); return inflight[READ] + inflight[WRITE]; } EXPORT_SYMBOL_GPL(bdev_count_inflight); /* * Can be deleted altogether. Later. * */ #define BLKDEV_MAJOR_HASH_SIZE 255 static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void (*probe)(dev_t devt); #endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ /** * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string * @probe: pre-devtmpfs / pre-udev callback used to create disks when their * pre-created device node is accessed. When a probe call uses * add_disk() and it fails the driver must cleanup resources. This * interface may soon be removed. * * The @name must be unique within the system. * * The return value depends on the @major input parameter: * * - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1] * then the function returns zero on success, or a negative error code * - if any unused major number was requested with @major = 0 parameter * then the return value is the allocated major number in range * [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. * * Use register_blkdev instead for any new code. */ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { if (major_names[index] == NULL) break; } if (index == 0) { printk("%s: failed to get major for %s\n", __func__, name); ret = -EBUSY; goto out; } major = index; ret = major; } if (major >= BLKDEV_MAJOR_MAX) { pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", __func__, major, BLKDEV_MAJOR_MAX-1, name); ret = -EINVAL; goto out; } p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); if (p == NULL) { ret = -ENOMEM; goto out; } p->major = major; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; #endif strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; } if (!*n) *n = p; else ret = -EBUSY; spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", major, name); kfree(p); } out: mutex_unlock(&major_names_lock); return ret; } EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { struct blk_major_name **n; struct blk_major_name *p = NULL; int index = major_to_index(major); mutex_lock(&major_names_lock); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; if (!*n || strcmp((*n)->name, name)) { WARN_ON(1); } else { p = *n; *n = p->next; } spin_unlock(&major_names_spinlock); mutex_unlock(&major_names_lock); kfree(p); } EXPORT_SYMBOL(unregister_blkdev); int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL); if (idx == -ENOSPC) return -EBUSY; return idx; } void blk_free_ext_minor(unsigned int minor) { ida_free(&ext_devt_ida, minor); } void disk_uevent(struct gendisk *disk, enum kobject_action action) { struct block_device *part; unsigned long idx; rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, part) { if (bdev_is_partition(part) && !bdev_nr_sectors(part)) continue; if (!kobject_get_unless_zero(&part->bd_device.kobj)) continue; rcu_read_unlock(); kobject_uevent(bdev_kobj(part), action); put_device(&part->bd_device); rcu_read_lock(); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { struct file *file; int ret = 0; if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; /* * If the device is opened exclusively by current thread already, it's * safe to scan partitons, otherwise, use bd_prepare_to_claim() to * synchronize with other exclusive openers and other partition * scanners. */ if (!(mode & BLK_OPEN_EXCL)) { ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, NULL); if (ret) return ret; } set_bit(GD_NEED_PART_SCAN, &disk->state); file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, NULL); if (IS_ERR(file)) ret = PTR_ERR(file); else fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, * and this will cause that re-assemble partitioned raid device will * creat partition for underlying disk. */ clear_bit(GD_NEED_PART_SCAN, &disk->state); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(disk->part0, disk_scan_partitions); return ret; } static void add_disk_final(struct gendisk *disk) { struct device *ddev = disk_to_dev(disk); if (!(disk->flags & GENHD_FL_HIDDEN)) { /* Make sure the first partition scan will be proceed */ if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) disk_scan_partitions(disk, BLK_OPEN_READ); /* * Announce the disk and partitions after all partitions are * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); } blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); disk_add_events(disk); set_bit(GD_ADDED, &disk->state); } static int __add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); int ret; if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS)) return -EINVAL; if (queue_is_mq(disk->queue)) { /* * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers. */ if (disk->fops->submit_bio || disk->fops->poll_bio) return -EINVAL; } else { if (!disk->fops->submit_bio) return -EINVAL; bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); } /* * If the driver provides an explicit major number it also must provide * the number of minors numbers supported, and those will be used to * setup the gendisk. * Otherwise just allocate the device numbers for both the whole device * and all partitions from the extended dev_t space. */ ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) goto out; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) goto out; } else { if (WARN_ON(disk->minors)) goto out; ret = blk_alloc_ext_minor(); if (ret < 0) goto out; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); if (fwnode) device_set_node(ddev, fwnode); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) goto out_free_ext_minor; ret = disk_alloc_events(disk); if (ret) goto out_device_del; ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); if (ret) goto out_device_del; /* * avoid probable deadlock caused by allocating memory with * GFP_KERNEL in runtime_resume callback of its all ancestor * devices */ pm_runtime_set_memalloc_noio(ddev, true); disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); if (!disk->part0->bd_holder_dir) { ret = -ENOMEM; goto out_del_block_link; } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (!disk->slave_dir) { ret = -ENOMEM; goto out_put_holder_dir; } ret = blk_register_queue(disk); if (ret) goto out_put_slave_dir; if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); ret = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; } else { /* * Even if the block_device for a hidden gendisk is not * registered, it needs to have a valid bd_dev so that the * freeing of the dynamic major works. */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } return 0; out_unregister_bdi: if (!(disk->flags & GENHD_FL_HIDDEN)) bdi_unregister(disk->bdi); out_unregister_queue: blk_unregister_queue(disk); rq_qos_exit(disk->queue); out_put_slave_dir: kobject_put(disk->slave_dir); disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); pm_runtime_set_memalloc_noio(ddev, false); out_device_del: device_del(ddev); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out: return ret; } /** * add_disk_fwnode - add disk information to kernel list with fwnode * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * @fwnode: attached disk fwnode * * This function registers the partitioning information in @disk * with the kernel. Also attach a fwnode to the disk device. */ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode) { struct blk_mq_tag_set *set; unsigned int memflags; int ret; if (queue_is_mq(disk->queue)) { set = disk->queue->tag_set; memflags = memalloc_noio_save(); down_read(&set->update_nr_hwq_lock); ret = __add_disk(parent, disk, groups, fwnode); up_read(&set->update_nr_hwq_lock); memalloc_noio_restore(memflags); } else { ret = __add_disk(parent, disk, groups, fwnode); } /* * add_disk_final() needn't to read `nr_hw_queues`, so move it out * of read lock `set->update_nr_hwq_lock` for avoiding unnecessary * lock dependency on `disk->open_mutex` from scanning partition. */ if (!ret) add_disk_final(disk); return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * * This function registers the partitioning information in @disk * with the kernel. */ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { return add_disk_fwnode(parent, disk, groups, NULL); } EXPORT_SYMBOL(device_add_disk); static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { struct block_device *bdev; unsigned long idx; /* * On surprise disk removal, bdev_mark_dead() may call into file * systems below. Make it clear that we're expecting to not hold * disk->open_mutex. */ lockdep_assert_not_held(&disk->open_mutex); rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, bdev) { if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) continue; rcu_read_unlock(); bdev_mark_dead(bdev, surprise); put_device(&bdev->bd_device); rcu_read_lock(); } rcu_read_unlock(); } static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* * Stop buffered writers from dirtying pages that can't be written out. */ set_capacity(disk, 0); /* * Prevent new I/O from crossing bio_queue_enter(). */ return blk_queue_start_drain(disk->queue); } /** * blk_mark_disk_dead - mark a disk as dead * @disk: disk to mark as dead * * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O * to this disk. */ void blk_mark_disk_dead(struct gendisk *disk) { __blk_mark_disk_dead(disk); blk_report_disk_dead(disk, true); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); static void __del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; bool start_drain; might_sleep(); if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; disk_del_events(disk); /* * Prevent new openers by unlinked the bdev inode. */ mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) bdev_unhash(part); mutex_unlock(&disk->open_mutex); /* * Tell the file system to write back all dirty data and shut down if * it hasn't been notified earlier. */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); start_drain = __blk_mark_disk_dead(disk); if (start_drain) blk_freeze_acquire_lock(q); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ bdi_unregister(disk->bdi); } blk_unregister_queue(disk); kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); disk->slave_dir = NULL; part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); blk_mq_freeze_queue_wait(q); blk_throtl_cancel_bios(disk); blk_sync_queue(q); blk_flush_integrity(); if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); rq_qos_exit(q); /* * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. */ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) __blk_mq_unfreeze_queue(q, true); else if (queue_is_mq(q)) blk_mq_exit_queue(q); if (start_drain) blk_unfreeze_release_lock(q); } static void disable_elv_switch(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; WARN_ON_ONCE(!queue_is_mq(q)); down_write(&set->update_nr_hwq_lock); blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q); up_write(&set->update_nr_hwq_lock); } /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove * * Removes the gendisk and all its associated resources. This deletes the * partitions associated with the gendisk, and unregisters the associated * request_queue. * * This is the counter to the respective device_add_disk() call. * * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if * device_add_disk() was used. * * Drivers exist which depend on the release of the gendisk to be synchronous, * it should not be deferred. * * Context: can sleep */ void del_gendisk(struct gendisk *disk) { struct blk_mq_tag_set *set; unsigned int memflags; if (!queue_is_mq(disk->queue)) { __del_gendisk(disk); } else { set = disk->queue->tag_set; disable_elv_switch(disk->queue); memflags = memalloc_noio_save(); down_read(&set->update_nr_hwq_lock); __del_gendisk(disk); up_read(&set->update_nr_hwq_lock); memalloc_noio_restore(memflags); } } EXPORT_SYMBOL(del_gendisk); /** * invalidate_disk - invalidate the disk * @disk: the struct gendisk to invalidate * * A helper to invalidates the disk. It will clean the disk's associated * buffer/page caches and reset its internal states so that the disk * can be reused by the drivers. * * Context: can sleep */ void invalidate_disk(struct gendisk *disk) { struct block_device *bdev = disk->part0; invalidate_bdev(bdev); bdev->bd_mapping->wb_err = 0; set_capacity(disk, 0); } EXPORT_SYMBOL(invalidate_disk); /* sysfs access to bad-blocks list. */ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, char *page) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return sysfs_emit(page, "\n"); return badblocks_show(disk->bb, page, 0); } static ssize_t disk_badblocks_store(struct device *dev, struct device_attribute *attr, const char *page, size_t len) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return -ENXIO; return badblocks_store(disk->bb, page, len, 0); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static bool blk_probe_dev(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; mutex_lock(&major_names_lock); for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); return true; } } mutex_unlock(&major_names_lock); return false; } void blk_request_module(dev_t devt) { int error; if (blk_probe_dev(devt)) return; error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)); /* Make old-style 2.4 aliases work */ if (error > 0) error = request_module("block-major-%d", MAJOR(devt)); if (!error) blk_probe_dev(devt); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ #ifdef CONFIG_PROC_FS /* iterator */ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) { loff_t skip = *pos; struct class_dev_iter *iter; struct device *dev; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return ERR_PTR(-ENOMEM); seqf->private = iter; class_dev_iter_init(iter, &block_class, NULL, &disk_type); do { dev = class_dev_iter_next(iter); if (!dev) return NULL; } while (skip--); return dev_to_disk(dev); } static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) { struct device *dev; (*pos)++; dev = class_dev_iter_next(seqf->private); if (dev) return dev_to_disk(dev); return NULL; } static void disk_seqf_stop(struct seq_file *seqf, void *v) { struct class_dev_iter *iter = seqf->private; /* stop is called even after start failed :-( */ if (iter) { class_dev_iter_exit(iter); kfree(iter); seqf->private = NULL; } } static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { void *p; p = disk_seqf_start(seqf, pos); if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct block_device *part; unsigned long idx; if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; } static const struct seq_operations partitions_op = { .start = show_partition_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = show_partition }; #endif static int __init genhd_device_init(void) { int error; error = class_register(&block_class); if (unlikely(error)) return error; blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); /* create top-level block dir */ block_depr = kobject_create_and_add("block", NULL); return 0; } subsys_initcall(genhd_device_init); static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } static ssize_t disk_hidden_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct disk_stats stat; unsigned int inflight; inflight = bdev_count_inflight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); part_stat_unlock(); } part_stat_read_all(bdev, &stat); return sysfs_emit(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", stat.ios[STAT_READ], stat.merges[STAT_READ], (unsigned long long)stat.sectors[STAT_READ], (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), stat.ios[STAT_WRITE], stat.merges[STAT_WRITE], (unsigned long long)stat.sectors[STAT_WRITE], (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), stat.ios[STAT_FLUSH], (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } /* * Show the number of IOs issued to driver. * For bio-based device, started from bdev_start_io_acct(); * For rq-based device, started from blk_mq_start_request(); */ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2] = {0}; bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q)); return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); return sysfs_emit(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%llu\n", disk->diskseq); } static ssize_t partscan_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL); static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL)); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) { if (i) bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); else bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); } return count; } static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif /* CONFIG_FAIL_MAKE_REQUEST */ #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); #endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif #ifdef CONFIG_FAIL_IO_TIMEOUT &dev_attr_fail_timeout.attr, #endif NULL }; static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, typeof(*dev), kobj); struct gendisk *disk = dev_to_disk(dev); if (a == &dev_attr_badblocks.attr && !disk->bb) return 0; return a->mode; } static struct attribute_group disk_attr_group = { .attrs = disk_attrs, .is_visible = disk_visible, }; static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif #ifdef CONFIG_BLK_DEV_INTEGRITY &blk_integrity_attr_group, #endif NULL }; /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk * * This function releases all allocated resources of the gendisk. * * Drivers which used device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue * will also be freed prior to the disk. * * Context: can sleep */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); might_sleep(); WARN_ON_ONCE(disk_live(disk)); blk_trace_remove(disk->queue); /* * To undo the all initialization from blk_mq_init_allocated_queue in * case of a probe failure where add_disk is never called we have to * call blk_mq_exit_queue here. We can't do this for the more common * teardown case (yet) as the tagset can be gone by the time the disk * is released once it was added. */ if (queue_is_mq(disk->queue) && test_bit(GD_OWNS_QUEUE, &disk->state) && !test_bit(GD_ADDED, &disk->state)) blk_mq_exit_queue(disk->queue); blkcg_exit_disk(disk); bioset_exit(&disk->bio_split); disk_release_events(disk); kfree(disk->random); disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); kobject_put(&disk->queue_kobj); disk->queue->disk = NULL; blk_put_queue(disk->queue); if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) disk->fops->free_disk(disk); bdev_drop(disk->part0); /* frees the disk */ } static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct gendisk *disk = dev_to_disk(dev); return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; static char *block_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid) { struct gendisk *disk = dev_to_disk(dev); if (disk->fops->devnode) return disk->fops->devnode(disk, mode); return NULL; } const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS /* * aggregate disk stat collector. Uses the same stats that the sysfs * entries do, above, but makes them available through one seq_file. * * The output looks suspiciously like /proc/partitions with a bunch of * extra fields. */ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; unsigned int inflight; struct disk_stats stat; unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) seq_puts(seqf, "major minor name" " rio rmerge rsect ruse wio wmerge " "wsect wuse running use aveq" "\n\n"); */ rcu_read_lock(); xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; inflight = bdev_count_inflight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); part_stat_unlock(); } part_stat_read_all(hd, &stat); seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4); seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7); seq_printf(seqf, " %pg", hd); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", inflight); seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks)); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); seq_putc(seqf, '\n'); } rcu_read_unlock(); return 0; } static const struct seq_operations diskstats_op = { .start = disk_seqf_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = diskstats_show }; static int __init proc_genhd_init(void) { proc_create_seq("diskstats", 0, NULL, &diskstats_op); proc_create_seq("partitions", 0, NULL, &partitions_op); return 0; } module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ dev_t part_devt(struct gendisk *disk, u8 partno) { struct block_device *part; dev_t devt = 0; rcu_read_lock(); part = xa_load(&disk->part_tbl, partno); if (part) devt = part->bd_dev; rcu_read_unlock(); return devt; } struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass) { struct gendisk *disk; disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) return NULL; if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0)) goto out_free_disk; disk->bdi = bdi_alloc(node_id); if (!disk->bdi) goto out_free_bioset; /* bdev_alloc() might need the queue, set before the first call */ disk->queue = q; disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); xa_init(&disk->part_tbl); if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; if (blkcg_init_disk(disk)) goto out_erase_part0; disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif mutex_init(&disk->rqos_state_mutex); kobject_init(&disk->queue_kobj, &blk_queue_ktype); return disk; out_erase_part0: xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; bdev_drop(disk->part0); out_free_bdi: bdi_put(disk->bdi); out_free_bioset: bioset_exit(&disk->bio_split); out_free_disk: kfree(disk); return NULL; } struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, struct lock_class_key *lkclass) { struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; q = blk_alloc_queue(lim ? lim : &default_lim, node); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_alloc_disk); /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. * * Note: for blk-mq disk put_disk must be called before freeing the tag_set * when handling probe errors (that is before add_disk() is called). * * Context: Any context, but the last reference must not be dropped from * atomic context. */ void put_disk(struct gendisk *disk) { if (disk) put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; char *envp[] = { event, NULL }; if (!ro) event[8] = '0'; kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } /** * set_disk_ro - set a gendisk read-only * @disk: gendisk to operate on * @read_only: %true to set the disk read-only, %false set the disk read/write * * This function is used to indicate whether a given disk device should have its * read-only flag set. set_disk_ro() is typically used by device drivers to * indicate whether the underlying physical device is write-protected. */ void set_disk_ro(struct gendisk *disk, bool read_only) { if (read_only) { if (test_and_set_bit(GD_READ_ONLY, &disk->state)) return; } else { if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) return; } set_disk_ro_uevent(disk, read_only); } EXPORT_SYMBOL(set_disk_ro); void inc_diskseq(struct gendisk *disk) { disk->diskseq = atomic64_inc_return(&diskseq); }
14 1 2 2 1 2 5 4 6 3 1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/tcp.h> #include <net/ip.h> #include <net/tcp.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netfilter/nfnetlink_osf.h> /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. */ struct list_head nf_osf_fingers[2]; EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } return ret; } struct nf_osf_hdr_ctx { bool df; u16 window; u16 totlen; const unsigned char *optp; unsigned int optsize; }; static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, int ttl_check, struct nf_osf_hdr_ctx *ctx) { const __u8 *optpinit = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; u16 mss = 0; if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; /* * Should not happen if userspace parser was written correctly. */ if (f->wss.wc >= OSF_WSS_MAX) return false; /* Check options */ foptsize = 0; for (optnum = 0; optnum < f->opt_num; ++optnum) foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN || ctx->optsize > MAX_IPOPTLEN || ctx->optsize != foptsize) return false; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { if (f->opt[optnum].kind == *ctx->optp) { __u32 len = f->opt[optnum].length; const __u8 *optend = ctx->optp + len; fmatch = FMATCH_OK; switch (*ctx->optp) { case OSFOPT_MSS: mss = ctx->optp[3]; mss <<= 8; mss |= ctx->optp[2]; mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: break; } ctx->optp = optend; } else fmatch = FMATCH_OPT_WRONG; if (fmatch != FMATCH_OK) break; } if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; switch (check_WSS) { case OSF_WSS_PLAIN: if (f->wss.val == 0 || ctx->window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: /* * Some smart modems decrease mangle MSS to * SMART_MSS_2, so we check standard, decreased * and the one provided in the fingerprint MSS * values. */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 if (ctx->window == f->wss.val * mss || ctx->window == f->wss.val * SMART_MSS_1 || ctx->window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: if (ctx->window == f->wss.val * (mss + 40) || ctx->window == f->wss.val * (SMART_MSS_1 + 40) || ctx->window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: if ((ctx->window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } } if (fmatch != FMATCH_OK) ctx->optp = optpinit; return fmatch == FMATCH_OK; } static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, unsigned char *opts, struct tcphdr *_tcph) { const struct tcphdr *tcp; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; if (!tcp->syn) return NULL; ctx->totlen = ntohs(ip->tot_len); ctx->df = ntohs(ip->frag_off) & IP_DF; ctx->window = ntohs(tcp->window); if (tcp->doff * 4 > sizeof(struct tcphdr)) { ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); if (!ctx->optp) return NULL; } return tcp; } bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; int fcount = 0, ttl_check; int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; fmatch = FMATCH_OK; fcount++; if (info->flags & NF_OSF_LOG) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest), f->ttl - ip->ttl); if ((info->flags & NF_OSF_LOG) && info->loglevel == NF_OSF_LOGLEVEL_FIRST) break; } if (!fcount && (info->flags & NF_OSF_LOG)) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); if (fcount) fmatch = FMATCH_OK; return fmatch == FMATCH_OK; } EXPORT_SYMBOL_GPL(nf_osf_match); bool nf_osf_find(const struct sk_buff *skb, const struct list_head *nf_osf_fingers, const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; bool found = false; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; data->genre = f->genre; data->version = f->version; found = true; break; } return found; } EXPORT_SYMBOL_GPL(nf_osf_find); static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; static int nfnl_osf_add_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); if (f->opt_num > ARRAY_SIZE(f->opt)) return -EINVAL; if (!memchr(f->genre, 0, MAXGENRELEN) || !memchr(f->subtype, 0, MAXGENRELEN) || !memchr(f->version, 0, MAXGENRELEN)) return -EINVAL; kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger)); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; kfree(kf); kf = NULL; if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } /* * We are protected by nfnl mutex. */ if (kf) list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]); return err; } static int nfnl_osf_remove_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; int err = -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; /* * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); kfree_rcu(sf, rcu_head); err = 0; break; } return err; } static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, }; static const struct nfnetlink_subsystem nfnl_osf_subsys = { .name = "osf", .subsys_id = NFNL_SUBSYS_OSF, .cb_count = OSF_MSG_MAX, .cb = nfnl_osf_callbacks, }; static int __init nfnl_osf_init(void) { int err = -EINVAL; int i; for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) INIT_LIST_HEAD(&nf_osf_fingers[i]); err = nfnetlink_subsys_register(&nfnl_osf_subsys); if (err < 0) { pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); goto err_out_exit; } return 0; err_out_exit: return err; } static void __exit nfnl_osf_fini(void) { struct nf_osf_finger *f; int i; nfnetlink_subsys_unregister(&nfnl_osf_subsys); rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) { list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); kfree_rcu(f, rcu_head); } } rcu_read_unlock(); rcu_barrier(); } module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Passive OS fingerprint matching"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2017 Netronome Systems, Inc. * Copyright (C) 2019 Mellanox Technologies. All rights reserved */ #include <linux/completion.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/sysfs.h> #include "netdevsim.h" static DEFINE_IDA(nsim_bus_dev_ids); static LIST_HEAD(nsim_bus_dev_list); static DEFINE_MUTEX(nsim_bus_dev_list_lock); static bool nsim_bus_enable; static refcount_t nsim_bus_devs; /* Including the bus itself. */ static DECLARE_COMPLETION(nsim_bus_devs_released); static struct nsim_bus_dev *to_nsim_bus_dev(struct device *dev) { return container_of(dev, struct nsim_bus_dev, dev); } static ssize_t nsim_bus_dev_numvfs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int num_vfs; int ret; ret = kstrtouint(buf, 0, &num_vfs); if (ret) return ret; device_lock(dev); ret = -ENOENT; if (dev_get_drvdata(dev)) ret = nsim_drv_configure_vfs(nsim_bus_dev, num_vfs); device_unlock(dev); return ret ? ret : count; } static ssize_t nsim_bus_dev_numvfs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return sprintf(buf, "%u\n", nsim_bus_dev->num_vfs); } static struct device_attribute nsim_bus_dev_numvfs_attr = __ATTR(sriov_numvfs, 0664, nsim_bus_dev_numvfs_show, nsim_bus_dev_numvfs_store); static ssize_t new_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); u8 eth_addr[ETH_ALEN] = {}; unsigned int port_index; bool addr_set = false; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = sscanf(buf, "%u %hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &port_index, &eth_addr[0], &eth_addr[1], &eth_addr[2], &eth_addr[3], &eth_addr[4], &eth_addr[5]); switch (ret) { case 7: if (!is_valid_ether_addr(eth_addr)) { pr_err("The supplied perm_addr is not a valid MAC address\n"); return -EINVAL; } addr_set = true; fallthrough; case 1: break; default: pr_err("Format for adding new port is \"id [perm_addr]\" (uint MAC).\n"); return -EINVAL; } ret = nsim_drv_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index, addr_set ? eth_addr : NULL); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_new_port_attr = __ATTR_WO(new_port); static ssize_t del_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_del_port_attr = __ATTR_WO(del_port); static struct attribute *nsim_bus_dev_attrs[] = { &nsim_bus_dev_numvfs_attr.attr, &nsim_bus_dev_new_port_attr.attr, &nsim_bus_dev_del_port_attr.attr, NULL, }; static const struct attribute_group nsim_bus_dev_attr_group = { .attrs = nsim_bus_dev_attrs, }; static const struct attribute_group *nsim_bus_dev_attr_groups[] = { &nsim_bus_dev_attr_group, NULL, }; static void nsim_bus_dev_release(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev; nsim_bus_dev = container_of(dev, struct nsim_bus_dev, dev); kfree(nsim_bus_dev); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); } static const struct device_type nsim_bus_dev_type = { .groups = nsim_bus_dev_attr_groups, .release = nsim_bus_dev_release, }; static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues); static ssize_t new_device_store(const struct bus_type *bus, const char *buf, size_t count) { unsigned int id, port_count, num_queues; struct nsim_bus_dev *nsim_bus_dev; int err; err = sscanf(buf, "%u %u %u", &id, &port_count, &num_queues); switch (err) { case 1: port_count = 1; fallthrough; case 2: num_queues = 1; fallthrough; case 3: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for adding new device is \"id port_count num_queues\" (uint uint unit).\n"); return -EINVAL; } mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { err = -EBUSY; goto err; } nsim_bus_dev = nsim_bus_dev_new(id, port_count, num_queues); if (IS_ERR(nsim_bus_dev)) { err = PTR_ERR(nsim_bus_dev); goto err; } refcount_inc(&nsim_bus_devs); /* Allow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, true); list_add_tail(&nsim_bus_dev->list, &nsim_bus_dev_list); mutex_unlock(&nsim_bus_dev_list_lock); return count; err: mutex_unlock(&nsim_bus_dev_list_lock); return err; } static BUS_ATTR_WO(new_device); static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev); static ssize_t del_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev, *tmp; unsigned int id; int err; err = sscanf(buf, "%u", &id); switch (err) { case 1: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for deleting device is \"id\" (uint).\n"); return -EINVAL; } err = -ENOENT; mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { mutex_unlock(&nsim_bus_dev_list_lock); return -EBUSY; } list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { if (nsim_bus_dev->dev.id != id) continue; list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); err = 0; break; } mutex_unlock(&nsim_bus_dev_list_lock); return !err ? count : err; } static BUS_ATTR_WO(del_device); static ssize_t link_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim_a, *nsim_b, *peer; struct net_device *dev_a, *dev_b; unsigned int ifidx_a, ifidx_b; int netnsfd_a, netnsfd_b, err; struct net *ns_a, *ns_b; err = sscanf(buf, "%d:%u %d:%u", &netnsfd_a, &ifidx_a, &netnsfd_b, &ifidx_b); if (err != 4) { pr_err("Format for linking two devices is \"netnsfd_a:ifidx_a netnsfd_b:ifidx_b\" (int uint int uint).\n"); return -EINVAL; } ns_a = get_net_ns_by_fd(netnsfd_a); if (IS_ERR(ns_a)) { pr_err("Could not find netns with fd: %d\n", netnsfd_a); return -EINVAL; } ns_b = get_net_ns_by_fd(netnsfd_b); if (IS_ERR(ns_b)) { pr_err("Could not find netns with fd: %d\n", netnsfd_b); put_net(ns_a); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev_a = __dev_get_by_index(ns_a, ifidx_a); if (!dev_a) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_a, netnsfd_a); goto out_err; } if (!netdev_is_nsim(dev_a)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_a, netnsfd_a); goto out_err; } dev_b = __dev_get_by_index(ns_b, ifidx_b); if (!dev_b) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_b, netnsfd_b); goto out_err; } if (!netdev_is_nsim(dev_b)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_b, netnsfd_b); goto out_err; } if (dev_a == dev_b) { pr_err("Cannot link a netdevsim to itself\n"); goto out_err; } err = -EBUSY; nsim_a = netdev_priv(dev_a); peer = rtnl_dereference(nsim_a->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_a, ifidx_a); goto out_err; } nsim_b = netdev_priv(dev_b); peer = rtnl_dereference(nsim_b->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_b, ifidx_b); goto out_err; } err = 0; rcu_assign_pointer(nsim_a->peer, nsim_b); rcu_assign_pointer(nsim_b->peer, nsim_a); if (netif_running(dev_a) && netif_running(dev_b)) { netif_carrier_on(dev_a); netif_carrier_on(dev_b); } out_err: put_net(ns_b); put_net(ns_a); rtnl_unlock(); return !err ? count : err; } static BUS_ATTR_WO(link_device); static ssize_t unlink_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim, *peer; struct net_device *dev; unsigned int ifidx; int netnsfd, err; struct net *ns; err = sscanf(buf, "%u:%u", &netnsfd, &ifidx); if (err != 2) { pr_err("Format for unlinking a device is \"netnsfd:ifidx\" (int uint).\n"); return -EINVAL; } ns = get_net_ns_by_fd(netnsfd); if (IS_ERR(ns)) { pr_err("Could not find netns with fd: %d\n", netnsfd); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev = __dev_get_by_index(ns, ifidx); if (!dev) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx, netnsfd); goto out_put_netns; } if (!netdev_is_nsim(dev)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx, netnsfd); goto out_put_netns; } nsim = netdev_priv(dev); peer = rtnl_dereference(nsim->peer); if (!peer) goto out_put_netns; netif_carrier_off(dev); netif_carrier_off(peer->netdev); err = 0; RCU_INIT_POINTER(nsim->peer, NULL); RCU_INIT_POINTER(peer->peer, NULL); synchronize_net(); netif_tx_wake_all_queues(dev); netif_tx_wake_all_queues(peer->netdev); out_put_netns: put_net(ns); rtnl_unlock(); return !err ? count : err; } static BUS_ATTR_WO(unlink_device); static struct attribute *nsim_bus_attrs[] = { &bus_attr_new_device.attr, &bus_attr_del_device.attr, &bus_attr_link_device.attr, &bus_attr_unlink_device.attr, NULL }; ATTRIBUTE_GROUPS(nsim_bus); static int nsim_bus_probe(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_drv_probe(nsim_bus_dev); } static void nsim_bus_remove(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); nsim_drv_remove(nsim_bus_dev); } static int nsim_num_vf(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_bus_dev->num_vfs; } static const struct bus_type nsim_bus = { .name = DRV_NAME, .dev_name = DRV_NAME, .bus_groups = nsim_bus_groups, .probe = nsim_bus_probe, .remove = nsim_bus_remove, .num_vf = nsim_num_vf, }; #define NSIM_BUS_DEV_MAX_VFS 4 static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues) { struct nsim_bus_dev *nsim_bus_dev; int err; nsim_bus_dev = kzalloc(sizeof(*nsim_bus_dev), GFP_KERNEL); if (!nsim_bus_dev) return ERR_PTR(-ENOMEM); err = ida_alloc_range(&nsim_bus_dev_ids, id, id, GFP_KERNEL); if (err < 0) goto err_nsim_bus_dev_free; nsim_bus_dev->dev.id = err; nsim_bus_dev->dev.bus = &nsim_bus; nsim_bus_dev->dev.type = &nsim_bus_dev_type; nsim_bus_dev->port_count = port_count; nsim_bus_dev->num_queues = num_queues; nsim_bus_dev->initial_net = current->nsproxy->net_ns; nsim_bus_dev->max_vfs = NSIM_BUS_DEV_MAX_VFS; /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); err = device_register(&nsim_bus_dev->dev); if (err) goto err_nsim_bus_dev_id_free; return nsim_bus_dev; err_nsim_bus_dev_id_free: ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); put_device(&nsim_bus_dev->dev); nsim_bus_dev = NULL; err_nsim_bus_dev_free: kfree(nsim_bus_dev); return ERR_PTR(err); } static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev) { /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); device_unregister(&nsim_bus_dev->dev); } static struct device_driver nsim_driver = { .name = DRV_NAME, .bus = &nsim_bus, .owner = THIS_MODULE, }; int nsim_bus_init(void) { int err; err = bus_register(&nsim_bus); if (err) return err; err = driver_register(&nsim_driver); if (err) goto err_bus_unregister; refcount_set(&nsim_bus_devs, 1); /* Allow using resources */ smp_store_release(&nsim_bus_enable, true); return 0; err_bus_unregister: bus_unregister(&nsim_bus); return err; } void nsim_bus_exit(void) { struct nsim_bus_dev *nsim_bus_dev, *tmp; /* Disallow using resources */ smp_store_release(&nsim_bus_enable, false); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); mutex_lock(&nsim_bus_dev_list_lock); list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); } mutex_unlock(&nsim_bus_dev_list_lock); wait_for_completion(&nsim_bus_devs_released); driver_unregister(&nsim_driver); bus_unregister(&nsim_bus); }
1356 1352 1100 1175 156 24 30 207 566 583 589 38 557 376 179 285 5 7 445 725 723 721 646 725 10 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h> /* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */ #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/neighbour_tables.h> /* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour; enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); WRITE_ONCE(p->data[index], val); } #define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) #define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr)) #define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr))) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct hlist_node hash; struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; unsigned int arp_queue_len_bytes; struct sk_buff_head arp_queue; struct timer_list timer; unsigned long used; atomic_t probes; u8 nud_state; u8 type; u8 dead; u8 protocol; u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; u8 primary_key[]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; union { struct list_head free_node; struct rcu_head rcu; }; u32 flags; u8 protocol; bool permanent; u32 key[]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { int family; unsigned int entry_size; unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; spinlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct mutex phash_lock; struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE BIT(0) #define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) #define NEIGH_UPDATE_F_USE BIT(3) #define NEIGH_UPDATE_F_MANAGED BIT(4) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) #define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 #define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) #define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; #define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) #define neigh_for_each_in_bucket_rcu(pos, head) \ hlist_for_each_entry_rcu(pos, head, hash) #define neigh_for_each_in_bucket_safe(pos, tmp, head) \ hlist_for_each_entry_safe(pos, tmp, head, hash) static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } static inline void neigh_confirm(struct neighbour *n) { if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); static inline void neigh_set_reach_time(struct neigh_parms *p) { unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME); WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base)); } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, u32 flags, u8 protocol, bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static __always_inline int neigh_event_send_probe(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) return __neigh_event_send(neigh, skb, immediate_ok); return 0; } static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { return neigh_event_send_probe(neigh, skb, true); } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { const struct hh_cache *hh = &n->hh; /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. */ if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { u8 ndm_flags = 0; ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0; if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) { if (ndm_flags & NTF_ROUTER) neigh->flags |= NTF_ROUTER; else neigh->flags &= ~NTF_ROUTER; *notify = 1; } } #endif
10 20 18 20 20 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #ifndef _WG_QUEUEING_H #define _WG_QUEUEING_H #include "peer.h" #include <linux/types.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip_tunnels.h> struct wg_device; struct wg_peer; struct multicore_worker; struct crypt_queue; struct prev_queue; struct sk_buff; /* queueing.c APIs: */ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, unsigned int len); void wg_packet_queue_free(struct crypt_queue *queue, bool purge); struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); /* receive.c APIs: */ void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb); void wg_packet_handshake_receive_worker(struct work_struct *work); /* NAPI poll function: */ int wg_packet_rx_poll(struct napi_struct *napi, int budget); /* Workqueue worker: */ void wg_packet_decrypt_worker(struct work_struct *work); /* send.c APIs: */ void wg_packet_send_queued_handshake_initiation(struct wg_peer *peer, bool is_retry); void wg_packet_send_handshake_response(struct wg_peer *peer); void wg_packet_send_handshake_cookie(struct wg_device *wg, struct sk_buff *initiating_skb, __le32 sender_index); void wg_packet_send_keepalive(struct wg_peer *peer); void wg_packet_purge_staged_packets(struct wg_peer *peer); void wg_packet_send_staged_packets(struct wg_peer *peer); /* Workqueue workers: */ void wg_packet_handshake_send_worker(struct work_struct *work); void wg_packet_tx_worker(struct work_struct *work); void wg_packet_encrypt_worker(struct work_struct *work); enum packet_state { PACKET_STATE_UNCRYPTED, PACKET_STATE_CRYPTED, PACKET_STATE_DEAD }; struct packet_cb { u64 nonce; struct noise_keypair *keypair; atomic_t state; u32 mtu; u8 ds; }; #define PACKET_CB(skb) ((struct packet_cb *)((skb)->cb)) #define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer) static inline bool wg_check_packet_protocol(struct sk_buff *skb) { __be16 real_protocol = ip_tunnel_parse_protocol(skb); return real_protocol && skb->protocol == real_protocol; } static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating) { u8 l4_hash = skb->l4_hash; u8 sw_hash = skb->sw_hash; u32 hash = skb->hash; skb_scrub_packet(skb, true); memset(&skb->headers, 0, sizeof(skb->headers)); if (encapsulating) { skb->l4_hash = l4_hash; skb->sw_hash = sw_hash; skb->hash = hash; } skb->queue_mapping = 0; skb->nohdr = 0; skb->peeked = 0; skb->mac_len = 0; skb->dev = NULL; #ifdef CONFIG_NET_SCHED skb->tc_index = 0; #endif skb_reset_redirect(skb); skb->hdr_len = skb_headroom(skb); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_probe_transport_header(skb); skb_reset_inner_headers(skb); } static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) { unsigned int cpu = *stored_cpu; while (unlikely(cpu >= nr_cpu_ids || !cpu_online(cpu))) cpu = *stored_cpu = cpumask_nth(id % num_online_cpus(), cpu_online_mask); return cpu; } /* This function is racy, in the sense that it's called while last_cpu is * unlocked, so it could return the same CPU twice. Adding locking or using * atomic sequence numbers is slower though, and the consequences of racing are * harmless, so live with it. */ static inline int wg_cpumask_next_online(int *last_cpu) { int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); WRITE_ONCE(*last_cpu, cpu); return cpu; } void wg_prev_queue_init(struct prev_queue *queue); /* Multi producer */ bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); /* Single consumer */ struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); /* Single consumer */ static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) { if (queue->peeked) return queue->peeked; queue->peeked = wg_prev_queue_dequeue(queue); return queue->peeked; } /* Single consumer */ static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) { queue->peeked = NULL; } static inline int wg_queue_enqueue_per_device_and_peer( struct crypt_queue *device_queue, struct prev_queue *peer_queue, struct sk_buff *skb, struct workqueue_struct *wq) { int cpu; atomic_set_release(&PACKET_CB(skb)->state, PACKET_STATE_UNCRYPTED); /* We first queue this up for the peer ingestion, but the consumer * will wait for the state to change to CRYPTED or DEAD before. */ if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) return -ENOSPC; /* Then we queue it up in the device queue, which consumes the * packet as soon as it can. */ cpu = wg_cpumask_next_online(&device_queue->last_cpu); if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb))) return -EPIPE; queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work); return 0; } static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), peer->device->packet_crypt_wq, &peer->transmit_packet_work); wg_peer_put(peer); } static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); napi_schedule(&peer->napi); wg_peer_put(peer); } #ifdef DEBUG bool wg_packet_counter_selftest(void); #endif #endif /* _WG_QUEUEING_H */
2 2 7 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 // SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); switch (state->pf) { #ifdef CONFIG_NF_TABLES_IPV4 case NFPROTO_IPV4: nft_set_pktinfo_ipv4(&pkt); break; #endif #ifdef CONFIG_NF_TABLES_IPV6 case NFPROTO_IPV6: nft_set_pktinfo_ipv6(&pkt); break; #endif default: break; } return nft_do_chain(&pkt, priv); } #ifdef CONFIG_NF_TABLES_IPV4 static const struct nft_chain_type nft_chain_nat_ipv4 = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_IPV4, .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { [NF_INET_PRE_ROUTING] = nft_nat_do_chain, [NF_INET_POST_ROUTING] = nft_nat_do_chain, [NF_INET_LOCAL_OUT] = nft_nat_do_chain, [NF_INET_LOCAL_IN] = nft_nat_do_chain, }, .ops_register = nf_nat_ipv4_register_fn, .ops_unregister = nf_nat_ipv4_unregister_fn, }; #endif #ifdef CONFIG_NF_TABLES_IPV6 static const struct nft_chain_type nft_chain_nat_ipv6 = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_IPV6, .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { [NF_INET_PRE_ROUTING] = nft_nat_do_chain, [NF_INET_POST_ROUTING] = nft_nat_do_chain, [NF_INET_LOCAL_OUT] = nft_nat_do_chain, [NF_INET_LOCAL_IN] = nft_nat_do_chain, }, .ops_register = nf_nat_ipv6_register_fn, .ops_unregister = nf_nat_ipv6_unregister_fn, }; #endif #ifdef CONFIG_NF_TABLES_INET static int nft_nat_inet_reg(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_inet_register_fn(net, ops); } static void nft_nat_inet_unreg(struct net *net, const struct nf_hook_ops *ops) { nf_nat_inet_unregister_fn(net, ops); } static const struct nft_chain_type nft_chain_nat_inet = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_INET, .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING), .hooks = { [NF_INET_PRE_ROUTING] = nft_nat_do_chain, [NF_INET_LOCAL_IN] = nft_nat_do_chain, [NF_INET_LOCAL_OUT] = nft_nat_do_chain, [NF_INET_POST_ROUTING] = nft_nat_do_chain, }, .ops_register = nft_nat_inet_reg, .ops_unregister = nft_nat_inet_unreg, }; #endif static int __init nft_chain_nat_init(void) { #ifdef CONFIG_NF_TABLES_IPV6 nft_register_chain_type(&nft_chain_nat_ipv6); #endif #ifdef CONFIG_NF_TABLES_IPV4 nft_register_chain_type(&nft_chain_nat_ipv4); #endif #ifdef CONFIG_NF_TABLES_INET nft_register_chain_type(&nft_chain_nat_inet); #endif return 0; } static void __exit nft_chain_nat_exit(void) { #ifdef CONFIG_NF_TABLES_IPV4 nft_unregister_chain_type(&nft_chain_nat_ipv4); #endif #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_chain_type(&nft_chain_nat_ipv6); #endif #ifdef CONFIG_NF_TABLES_INET nft_unregister_chain_type(&nft_chain_nat_inet); #endif } module_init(nft_chain_nat_init); module_exit(nft_chain_nat_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("nftables network address translation support"); #ifdef CONFIG_NF_TABLES_IPV4 MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); #endif #ifdef CONFIG_NF_TABLES_IPV6 MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat"); #endif #ifdef CONFIG_NF_TABLES_INET MODULE_ALIAS_NFT_CHAIN(1, "nat"); /* NFPROTO_INET */ #endif
1676 1676 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * Implements 3 trace clock variants, with differing scalability/precision * tradeoffs: * * - local: CPU-local trace clock * - medium: scalable global clock with some jitter * - global: globally monotonic, serialized clock * * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> #include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/ktime.h> #include <linux/trace_clock.h> /* * trace_clock_local(): the simplest and least coherent tracing clock. * * Useful for tracing that does not cross to other CPUs nor * does it go through idle events. */ u64 notrace trace_clock_local(void) { u64 clock; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ preempt_disable_notrace(); clock = sched_clock(); preempt_enable_notrace(); return clock; } EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of * jitter between CPUs. So it's a pretty scalable clock, but there * can be offsets in the trace data. */ u64 notrace trace_clock(void) { return local_clock(); } EXPORT_SYMBOL_GPL(trace_clock); /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. * Note that this use of jiffies_64 is not completely safe on * 32-bit systems. But the window is tiny, and the effect if * we are affected is that we will have an obviously bogus * timestamp on a trace event - i.e. not life threatening. */ u64 notrace trace_clock_jiffies(void) { return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } EXPORT_SYMBOL_GPL(trace_clock_jiffies); /* * trace_clock_global(): special globally coherent trace clock * * It has higher overhead than the other trace clocks but is still * an order of magnitude faster than GTOD derived hardware clocks. * * Used by plugins that need globally coherent timestamps. */ /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); /* * The global clock "guarantees" that the events are ordered * between CPUs. But if two events on two different CPUS call * trace_clock_global at roughly the same time, it really does * not matter which one gets the earlier time. Just make sure * that the same CPU will always show a monotonic clock. * * Use a read memory barrier to get the latest written * time that was recorded. */ smp_rmb(); prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) now = prev_time; /* * If in an NMI context then dont risk lockups and simply return * the current time. */ if (unlikely(in_nmi())) goto out; /* Tracing can cause strange recursion, always use a try lock */ if (arch_spin_trylock(&trace_clock_struct.lock)) { /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) now = prev_time; trace_clock_struct.prev_time = now; /* The unlock acts as the wmb for the above rmb */ arch_spin_unlock(&trace_clock_struct.lock); } out: raw_local_irq_restore(flags); return now; } EXPORT_SYMBOL_GPL(trace_clock_global); static atomic64_t trace_counter; /* * trace_clock_counter(): simply an atomic counter. * Use the trace_counter "counter" for cases where you do not care * about timings, but are interested in strict ordering. */ u64 notrace trace_clock_counter(void) { return atomic64_inc_return(&trace_counter); }
1 1 1 1 1 1 1 1 1 13 13 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 10 13 6 1 1 2 2 1 2 3 6 6 13 13 13 13 13 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_sfb.c Stochastic Fair Blue * * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> * * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: * A New Class of Active Queue Management Algorithms. * U. Michigan CSE-TR-387-99, April 1999. * * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/siphash.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/inet_ecn.h> /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) * This implementation uses L = 8 and N = 16 * This permits us to split one 32bit hash (provided per packet by rxhash or * external classifier) into 8 subhashes of 4 bits. */ #define SFB_BUCKET_SHIFT 4 #define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ #define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) #define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ /* SFB algo uses a virtual queue, named "bin" */ struct sfb_bucket { u16 qlen; /* length of virtual queue */ u16 p_mark; /* marking probability */ }; /* We use a double buffering right before hash change * (Section 4.4 of SFB reference : moving hash functions) */ struct sfb_bins { siphash_key_t perturbation; /* siphash key */ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; }; struct sfb_sched_data { struct Qdisc *qdisc; struct tcf_proto __rcu *filter_list; struct tcf_block *block; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; u32 bin_size; /* maximum queue length per bin */ u32 increment; /* d1 */ u32 decrement; /* d2 */ u32 limit; /* HARD maximal queue length */ u32 penalty_rate; u32 penalty_burst; u32 tokens_avail; unsigned long rehash_time; unsigned long token_time; u8 slot; /* current active bins (0 or 1) */ bool double_buffering; struct sfb_bins bins[2]; struct { u32 earlydrop; u32 penaltydrop; u32 bucketdrop; u32 queuedrop; u32 childdrop; /* drops in child qdisc */ u32 marked; /* ECN mark */ } stats; }; /* * Each queued skb might be hashed on one or two bins * We store in skb_cb the two hash values. * (A zero value means double buffering was not used) */ struct sfb_skb_cb { u32 hashes[2]; }; static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } /* * If using 'internal' SFB flow classifier, hash comes from skb rxhash * If using external classifier, hash comes from the classid. */ static u32 sfb_hash(const struct sk_buff *skb, u32 slot) { return sfb_skb_cb(skb)->hashes[slot]; } /* Probabilities are coded as Q0.16 fixed-point values, * with 0xFFFF representing 65535/65536 (almost 1.0) * Addition and subtraction are saturating in [0, 65535] */ static u32 prob_plus(u32 p1, u32 p2) { u32 res = p1 + p2; return min_t(u32, res, SFB_MAX_PROB); } static u32 prob_minus(u32 p1, u32 p2) { return p1 > p2 ? p1 - p2 : 0; } static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) { int i; struct sfb_bucket *b = &q->bins[slot].bins[0][0]; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen < 0xFFFF) b[hash].qlen++; b += SFB_NUMBUCKETS; /* next level */ } } static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } static void decrement_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) { int i; struct sfb_bucket *b = &q->bins[slot].bins[0][0]; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen > 0) b[hash].qlen--; b += SFB_NUMBUCKETS; /* next level */ } } static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) { u32 sfbhash; sfbhash = sfb_hash(skb, 0); if (sfbhash) decrement_one_qlen(sfbhash, 0, q); sfbhash = sfb_hash(skb, 1); if (sfbhash) decrement_one_qlen(sfbhash, 1, q); } static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { b->p_mark = prob_minus(b->p_mark, q->decrement); } static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { b->p_mark = prob_plus(b->p_mark, q->increment); } static void sfb_zero_all_buckets(struct sfb_sched_data *q) { memset(&q->bins, 0, sizeof(q->bins)); } /* * compute max qlen, max p_mark, and avg p_mark */ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) { int i; u32 qlen = 0, prob = 0, totalpm = 0; const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { if (qlen < b->qlen) qlen = b->qlen; totalpm += b->p_mark; if (prob < b->p_mark) prob = b->p_mark; b++; } *prob_r = prob; *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); return qlen; } static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) { get_random_bytes(&q->bins[slot].perturbation, sizeof(q->bins[slot].perturbation)); } static void sfb_swap_slot(struct sfb_sched_data *q) { sfb_init_perturbation(q->slot, q); q->slot ^= 1; q->double_buffering = false; } /* Non elastic flows are allowed to use part of the bandwidth, expressed * in "penalty_rate" packets per second, with "penalty_burst" burst */ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) { if (q->penalty_rate == 0 || q->penalty_burst == 0) return true; if (q->tokens_avail < 1) { unsigned long age = min(10UL * HZ, jiffies - q->token_time); q->tokens_avail = (age * q->penalty_rate) / HZ; if (q->tokens_avail > q->penalty_burst) q->tokens_avail = q->penalty_burst; q->token_time = jiffies; if (q->tokens_avail < 1) return true; } q->tokens_avail--; return false; } static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, int *qerr, u32 *salt) { struct tcf_result res; int result; result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return false; } #endif *salt = TC_H_MIN(res.classid); return true; } return false; } static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; u32 r, sfbhash; u32 slot = q->slot; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); q->stats.queuedrop++; goto drop; } if (q->rehash_interval > 0) { unsigned long limit = q->rehash_time + q->rehash_interval; if (unlikely(time_after(jiffies, limit))) { sfb_swap_slot(q); q->rehash_time = jiffies; } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && time_after(jiffies, limit - q->warmup_time))) { q->double_buffering = true; } } fl = rcu_dereference_bh(q->filter_list); if (fl) { u32 salt; /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation); } else { sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); } if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; sfbhash >>= SFB_BUCKET_SHIFT; if (b->qlen == 0) decrement_prob(b, q); else if (b->qlen >= q->bin_size) increment_prob(b, q); if (minqlen > b->qlen) minqlen = b->qlen; if (p_min > b->p_mark) p_min = b->p_mark; } slot ^= 1; sfb_skb_cb(skb)->hashes[slot] = 0; if (unlikely(minqlen >= q->max)) { qdisc_qstats_overlimit(sch); q->stats.bucketdrop++; goto drop; } if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; sfbhash >>= SFB_BUCKET_SHIFT; if (b->qlen == 0) decrement_prob(b, q); else if (b->qlen >= q->bin_size) increment_prob(b, q); } } if (sfb_rate_limit(skb, q)) { qdisc_qstats_overlimit(sch); q->stats.penaltydrop++; goto drop; } goto enqueue; } r = get_random_u16() & SFB_MAX_PROB; reason = SKB_DROP_REASON_QDISC_CONGESTED; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { /* If we're marking that many packets, then either * this flow is unresponsive, or we're badly congested. * In either case, we want to start dropping packets. */ if (r < (p_min - SFB_MAX_PROB / 2) * 2) { q->stats.earlydrop++; goto drop; } } if (INET_ECN_set_ce(skb)) { q->stats.marked++; } else { q->stats.earlydrop++; goto drop; } } enqueue: memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { sch->qstats.backlog += len; sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); } return ret; drop: qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } static struct sk_buff *sfb_dequeue(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; struct sk_buff *skb; skb = child->dequeue(q->qdisc); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; decrement_qlen(skb, q); } return skb; } static struct sk_buff *sfb_peek(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; return child->ops->peek(child); } /* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ static void sfb_reset(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); if (likely(q->qdisc)) qdisc_reset(q->qdisc); q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); sfb_init_perturbation(0, q); } static void sfb_destroy(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); qdisc_put(q->qdisc); } static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, }; static const struct tc_sfb_qopt sfb_default_ops = { .rehash_interval = 600 * MSEC_PER_SEC, .warmup_time = 60 * MSEC_PER_SEC, .limit = 0, .max = 25, .bin_size = 20, .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ .decrement = (SFB_MAX_PROB + 3000) / 6000, .penalty_rate = 10, .penalty_burst = 20, }; static int sfb_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child, *old; struct nlattr *tb[TCA_SFB_MAX + 1]; const struct tc_sfb_qopt *ctl = &sfb_default_ops; u32 limit; int err; if (opt) { err = nla_parse_nested_deprecated(tb, TCA_SFB_MAX, opt, sfb_policy, NULL); if (err < 0) return -EINVAL; if (tb[TCA_SFB_PARMS] == NULL) return -EINVAL; ctl = nla_data(tb[TCA_SFB_PARMS]); } limit = ctl->limit; if (limit == 0) limit = qdisc_dev(sch)->tx_queue_len; child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit, extack); if (IS_ERR(child)) return PTR_ERR(child); if (child != &noop_qdisc) qdisc_hash_add(child, true); sch_tree_lock(sch); qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); q->warmup_time = msecs_to_jiffies(ctl->warmup_time); q->rehash_time = jiffies; q->limit = limit; q->increment = ctl->increment; q->decrement = ctl->decrement; q->max = ctl->max; q->bin_size = ctl->bin_size; q->penalty_rate = ctl->penalty_rate; q->penalty_burst = ctl->penalty_burst; q->tokens_avail = ctl->penalty_burst; q->token_time = jiffies; q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); sfb_init_perturbation(0, q); sfb_init_perturbation(1, q); sch_tree_unlock(sch); qdisc_put(old); return 0; } static int sfb_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); int err; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->qdisc = &noop_qdisc; return sfb_change(sch, opt, extack); } static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfb_sched_data *q = qdisc_priv(sch); struct nlattr *opts; struct tc_sfb_qopt opt = { .rehash_interval = jiffies_to_msecs(q->rehash_interval), .warmup_time = jiffies_to_msecs(q->warmup_time), .limit = q->limit, .max = q->max, .bin_size = q->bin_size, .increment = q->increment, .decrement = q->decrement, .penalty_rate = q->penalty_rate, .penalty_burst = q->penalty_burst, }; sch->qstats.backlog = q->qdisc->qstats.backlog; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -EMSGSIZE; } static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct sfb_sched_data *q = qdisc_priv(sch); struct tc_sfb_xstats st = { .earlydrop = q->stats.earlydrop, .penaltydrop = q->stats.penaltydrop, .bucketdrop = q->stats.bucketdrop, .queuedrop = q->stats.queuedrop, .childdrop = q->stats.childdrop, .marked = q->stats.marked, }; st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); return gnet_stats_copy_app(d, &st, sizeof(st)); } static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { return -ENOSYS; } static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); return 0; } static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) { struct sfb_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long sfb_find(struct Qdisc *sch, u32 classid) { return 1; } static void sfb_unbind(struct Qdisc *sch, unsigned long arg) { } static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { return -ENOSYS; } static int sfb_delete(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { return -ENOSYS; } static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { tc_qdisc_stats_dump(sch, 1, walker); } } static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static const struct Qdisc_class_ops sfb_class_ops = { .graft = sfb_graft, .leaf = sfb_leaf, .find = sfb_find, .change = sfb_change_class, .delete = sfb_delete, .walk = sfb_walk, .tcf_block = sfb_tcf_block, .bind_tcf = sfb_bind, .unbind_tcf = sfb_unbind, .dump = sfb_dump_class, }; static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { .id = "sfb", .priv_size = sizeof(struct sfb_sched_data), .cl_ops = &sfb_class_ops, .enqueue = sfb_enqueue, .dequeue = sfb_dequeue, .peek = sfb_peek, .init = sfb_init, .reset = sfb_reset, .destroy = sfb_destroy, .change = sfb_change, .dump = sfb_dump, .dump_stats = sfb_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("sfb"); static int __init sfb_module_init(void) { return register_qdisc(&sfb_qdisc_ops); } static void __exit sfb_module_exit(void) { unregister_qdisc(&sfb_qdisc_ops); } module_init(sfb_module_init) module_exit(sfb_module_exit) MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); MODULE_AUTHOR("Juliusz Chroboczek"); MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL");
8 8 43 44 44 44 500 502 501 501 500 499 501 502 500 501 492 8 493 494 8 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 // SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "CRED: " fmt #include <linux/export.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/coredump.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> #include <linux/binfmts.h> #include <linux/cn_proc.h> #include <linux/uidgid.h> #if 0 #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__) #else #define kdebug(FMT, ...) \ do { \ if (0) \ no_printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__); \ } while (0) #endif static struct kmem_cache *cred_jar; /* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); kdebug("put_cred_rcu(%p)", cred); if (atomic_long_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %ld\n", cred, atomic_long_read(&cred->usage)); security_cred_free(cred); key_put(cred->session_keyring); key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); if (cred->ucounts) put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { kdebug("__put_cred(%p{%ld})", cred, atomic_long_read(&cred->usage)); BUG_ON(atomic_long_read(&cred->usage) != 0); BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); if (cred->non_rcu) put_cred_rcu(&cred->rcu); else call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); /* * Clean up a task's credentials when it exits */ void exit_creds(struct task_struct *tsk) { struct cred *real_cred, *cred; kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred, atomic_long_read(&tsk->cred->usage)); real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; cred = (struct cred *) tsk->cred; tsk->cred = NULL; if (real_cred == cred) { put_cred_many(cred, 2); } else { put_cred(real_cred); put_cred(cred); } #ifdef CONFIG_KEYS_REQUEST_CACHE key_put(tsk->cached_requested_key); tsk->cached_requested_key = NULL; #endif } /** * get_task_cred - Get another task's objective credentials * @task: The task to query * * Get the objective credentials of a task, pinning them so that they can't go * away. Accessing a task's credentials directly is not permitted. * * The caller must also make sure task doesn't get deleted, either by holding a * ref on task or by holding tasklist_lock to prevent it from being unlinked. */ const struct cred *get_task_cred(struct task_struct *task) { const struct cred *cred; rcu_read_lock(); do { cred = __task_cred((task)); BUG_ON(!cred); } while (!get_cred_rcu(cred)); rcu_read_unlock(); return cred; } EXPORT_SYMBOL(get_task_cred); /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. */ struct cred *cred_alloc_blank(void) { struct cred *new; new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); if (!new) return NULL; atomic_long_set(&new->usage, 1); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } /** * prepare_creds - Prepare a new set of credentials for modification * * Prepare a new set of task credentials for modification. A task's creds * shouldn't generally be modified directly, therefore this function is used to * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * * Preparation involves making a copy of the objective creds for modification. * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. */ struct cred *prepare_creds(void) { struct task_struct *task = current; const struct cred *old; struct cred *new; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_creds() alloc %p", new); old = task->cred; memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->session_keyring); key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { struct cred *new; new = prepare_creds(); if (!new) return new; #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; /* inherit the session keyring; new process keyring */ key_put(new->process_keyring); new->process_keyring = NULL; #endif new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; return new; } /* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new * set. * * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; #ifdef CONFIG_KEYS_REQUEST_CACHE p->cached_requested_key = NULL; #endif if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && #endif clone_flags & CLONE_THREAD ) { p->real_cred = get_cred_many(p->cred, 2); kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); get_cred_namespaces(p); return 0; } new = prepare_creds(); if (!new) return -ENOMEM; if (clone_flags & CLONE_NEWUSER) { ret = create_user_ns(new); if (ret < 0) goto error_put; ret = set_cred_ucounts(new); if (ret < 0) goto error_put; } #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ if (new->thread_keyring) { key_put(new->thread_keyring); new->thread_keyring = NULL; if (clone_flags & CLONE_THREAD) install_thread_keyring_to_cred(new); } /* The process keyring is only shared between the threads in a process; * anything outside of those threads doesn't inherit. */ if (!(clone_flags & CLONE_THREAD)) { key_put(new->process_keyring); new->process_keyring = NULL; } #endif p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); get_cred_namespaces(p); return 0; error_put: put_cred(new); return ret; } static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) { const struct user_namespace *set_ns = set->user_ns; const struct user_namespace *subset_ns = subset->user_ns; /* If the two credentials are in the same user namespace see if * the capabilities of subset are a subset of set. */ if (set_ns == subset_ns) return cap_issubset(subset->cap_permitted, set->cap_permitted); /* The credentials are in a different user namespaces * therefore one is a subset of the other only if a set is an * ancestor of subset and set->euid is owner of subset or one * of subsets ancestors. */ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { if ((set_ns == subset_ns->parent) && uid_eq(subset_ns->owner, set->euid)) return true; } return false; } /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace * the old set. Both the objective and the subjective credentials pointers are * updated. This function may not be called if the subjective credentials are * in an overridden state. * * This function eats the caller's reference to the new credentials. * * Always returns 0 thus allowing this function to be tail-called at the end * of, say, sys_setgid(). */ int commit_creds(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; kdebug("commit_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(task->cred != old); BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ if (!uid_eq(old->euid, new->euid) || !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, * the dumpability change must become visible before * the credential change; otherwise, a __ptrace_may_access() * racing with this change may be able to attach to a task it * shouldn't be able to attach to (as if the task had dropped * privileges without becoming nondumpable). * Pairs with a read barrier in __ptrace_may_access(). */ smp_wmb(); } /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); if (new->user_ns != old->user_ns) switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || !uid_eq(new->euid, old->euid) || !uid_eq(new->suid, old->suid) || !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); if (!gid_eq(new->gid, old->gid) || !gid_eq(new->egid, old->egid) || !gid_eq(new->sgid, old->sgid) || !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ put_cred_many(old, 2); return 0; } EXPORT_SYMBOL(commit_creds); /** * abort_creds - Discard a set of credentials and unlock the current task * @new: The credentials that were going to be applied * * Discard a set of credentials that were under construction and unlock the * current task. */ void abort_creds(struct cred *new) { kdebug("abort_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); /** * cred_fscmp - Compare two credentials with respect to filesystem access. * @a: The first credential * @b: The second credential * * cred_cmp() will return zero if both credentials have the same * fsuid, fsgid, and supplementary groups. That is, if they will both * provide the same access to files based on mode/uid/gid. * If the credentials are different, then either -1 or 1 will * be returned depending on whether @a comes before or after @b * respectively in an arbitrary, but stable, ordering of credentials. * * Return: -1, 0, or 1 depending on comparison */ int cred_fscmp(const struct cred *a, const struct cred *b) { struct group_info *ga, *gb; int g; if (a == b) return 0; if (uid_lt(a->fsuid, b->fsuid)) return -1; if (uid_gt(a->fsuid, b->fsuid)) return 1; if (gid_lt(a->fsgid, b->fsgid)) return -1; if (gid_gt(a->fsgid, b->fsgid)) return 1; ga = a->group_info; gb = b->group_info; if (ga == gb) return 0; if (ga == NULL) return -1; if (gb == NULL) return 1; if (ga->ngroups < gb->ngroups) return -1; if (ga->ngroups > gb->ngroups) return 1; for (g = 0; g < ga->ngroups; g++) { if (gid_lt(ga->gid[g], gb->gid[g])) return -1; if (gid_gt(ga->gid[g], gb->gid[g])) return 1; } return 0; } EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { struct ucounts *new_ucounts, *old_ucounts = new->ucounts; /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. */ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; put_ucounts(old_ucounts); return 0; } /* * initialise the credentials stuff */ void __init cred_init(void) { /* allocate a slab in which we can store credentials */ cred_jar = KMEM_CACHE(cred, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } /** * prepare_kernel_cred - Prepare a set of credentials for a kernel service * @daemon: A userspace daemon to be used as a reference * * Prepare a set of credentials for a kernel service. This can then be used to * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * * @daemon is used to provide a base cred, with the security data derived from * that; if this is "&init_task", they'll be set to 0, no groups, full * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { const struct cred *old; struct cred *new; if (WARN_ON_ONCE(!daemon)) return NULL; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); *new = *old; new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS new->session_keyring = NULL; new->process_keyring = NULL; new->thread_keyring = NULL; new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); return new; error: put_cred(new); put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); /** * set_security_override - Set the security ID in a set of credentials * @new: The credentials to alter * @secid: The LSM security ID to set * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. */ int set_security_override(struct cred *new, u32 secid) { return security_kernel_act_as(new, secid); } EXPORT_SYMBOL(set_security_override); /** * set_security_override_from_ctx - Set the security ID in a set of credentials * @new: The credentials to alter * @secctx: The LSM security context to generate the security ID from. * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. The * security ID is specified in string form as a security context to be * interpreted by the LSM. */ int set_security_override_from_ctx(struct cred *new, const char *secctx) { u32 secid; int ret; ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); if (ret < 0) return ret; return set_security_override(new, secid); } EXPORT_SYMBOL(set_security_override_from_ctx); /** * set_create_files_as - Set the LSM file create context in a set of credentials * @new: The credentials to alter * @inode: The inode to take the context from * * Change the LSM file creation context in a set of credentials to be the same * as the object context of the specified inode, so that the new inodes have * the same MAC context as that inode. */ int set_create_files_as(struct cred *new, struct inode *inode) { if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as);
39 39 2 1 1 1 3 52 49 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the UDP-Lite (RFC 3828) code. */ #ifndef _UDPLITE_H #define _UDPLITE_H #include <net/ip6_checksum.h> #include <net/udp.h> /* UDP-Lite socket options */ #define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ #define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ extern struct proto udplite_prot; extern struct udp_table udplite_table; /* * Checksum computation is all in software, hence simpler getfrag. */ static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; } /* * Checksumming routines */ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) { u16 cscov; /* In UDPv4 a zero checksum means that the transmitter generated no * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets * with a zero checksum field are illegal. */ if (uh->check == 0) { net_dbg_ratelimited("UDPLite: zeroed checksum field\n"); return 1; } cscov = ntohs(uh->len); if (cscov == 0) /* Indicates that full coverage is required. */ ; else if (cscov < 8 || cscov > skb->len) { /* * Coverage length violates RFC 3828: log and discard silently. */ net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n", cscov, skb->len); return 1; } else if (cscov < skb->len) { UDP_SKB_CB(skb)->partial_cov = 1; UDP_SKB_CB(skb)->cscov = cscov; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; skb->csum_valid = 0; } return 0; } /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { const int off = skb_transport_offset(skb); const struct sock *sk = skb->sk; int len = skb->len - off; if (udp_test_bit(UDPLITE_SEND_CC, sk)) { u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen); if (pcslen < len) { if (pcslen > 0) len = pcslen; udp_hdr(skb)->len = htons(pcslen); } } skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ return skb_checksum(skb, off, len, 0); } void udplite4_register(void); #endif /* _UDPLITE_H */
21235 21220 9013 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> #include <linux/static_call.h> #include <asm/cpuid/api.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/i8259.h> #include <asm/msr.h> #include <asm/topology.h> #include <asm/uv/uv.h> #include <asm/sev.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); #define KHZ 1000 /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; static struct clocksource_base art_base_clk = { .id = CSID_X86_ART, }; static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); static int __init tsc_early_khz_setup(char *buf) { return kstrtouint(buf, 0, &tsc_early_khz); } early_param("tsc_early_khz", tsc_early_khz_setup); __always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { preempt_disable_notrace(); __cyc2ns_read(data); } __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } /* * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) * ns = cycles * (ns_per_sec / freq) * ns = cycles * (10^9 / (cpu_khz * 10^3)) * ns = cycles * (10^6 / cpu_khz) * * Then we use scaling math (suggested by george@mvista.com) to get: * ns = cycles * (10^6 * SC / cpu_khz) / SC * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div * into a shift. The larger SC is, the more accurate the conversion, but * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication * (64-bit result) can be used. * * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); return ns; } static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { unsigned long long ns; preempt_disable_notrace(); ns = __cycles_2_ns(cyc); preempt_enable_notrace(); return ns; } static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; ns_now = cycles_2_ns(tsc_now); /* * Compute a new multiplier as per the above comment and ensure our * time function is continuous; see the comment near struct * cyc2ns_data. */ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* * cyc2ns_shift is exported via arch_perf_update_userpage() where it is * not expected to be greater than 31 due to the original published * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ if (data.cyc2ns_shift == 32) { data.cyc2ns_shift = 31; data.cyc2ns_mul >>= 1; } data.cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); c2n = per_cpu_ptr(&cyc2ns, cpu); write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; write_seqcount_latch(&c2n->seq); c2n->data[1] = data; write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); if (khz) __set_cyc2ns_scale(khz, cpu, tsc_now); sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* * Initialize cyc2ns for boot cpu */ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { unsigned int cpu, this_cpu = smp_processor_id(); struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); struct cyc2ns_data *data = c2n->data; for_each_possible_cpu(cpu) { if (cpu != this_cpu) { seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; } } } /* * Scheduler clock - returns current time in nanosec units. */ noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); /* return the value in ns */ return __cycles_2_ns(tsc_now); } /* * Fall back to jiffies if there's no TSC available: * ( But note that we still use it if the TSC is marked * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform * can achieve it. ) */ /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* * Generate a sched_clock if you already have a TSC value. */ u64 native_sched_clock_from_tsc(u64 tsc) { return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } bool using_native_sched_clock(void) { return static_call_query(pv_sched_clock) == native_sched_clock; } #else u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif notrace u64 sched_clock(void) { u64 now; preempt_disable_notrace(); now = sched_clock_noinstr(); preempt_enable_notrace(); return now; } int check_tsc_unstable(void) { return tsc_unstable; } EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { mark_tsc_unstable("boot parameter notsc"); return 1; } #else /* * disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ int __init notsc_setup(char *str) { setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif __setup("notsc", notsc_setup); static int no_sched_irq_time; static int no_tsc_watchdog; static int tsc_as_watchdog; static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); if (!strcmp(str, "nowatchdog")) { no_tsc_watchdog = 1; if (tsc_as_watchdog) pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", __func__); tsc_as_watchdog = 0; } if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; if (!strcmp(str, "watchdog")) { if (no_tsc_watchdog) pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", __func__); else tsc_as_watchdog = 1; } return 1; } __setup("tsc=", tsc_setup); #define MAX_RETRIES 5 #define TSC_DEFAULT_THRESHOLD 0x20000 /* * Read TSC and the reference counters. Take care of any disturbances */ static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; int i; for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < thresh) return t2; } return ULLONG_MAX; } /* * Calculate the TSC frequency from HPET reference */ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { u64 tmp; if (hpet2 < hpet1) hpet2 += 0x100000000ULL; hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } /* * Calculate the TSC frequency from PMTimer reference */ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) { u64 tmp; if (!pm1 && !pm2) return ULONG_MAX; if (pm2 < pm1) pm2 += (u64)ACPI_PM_OVRRUN; pm2 -= pm1; tmp = pm2 * 1000000000LL; do_div(tmp, PMTMR_TICKS_PER_SEC); do_div(deltatsc, tmp); return (unsigned long) deltatsc; } #define CAL_MS 10 #define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 #define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 /* * Try to calibrate the TSC against the Programmable * Interrupt Timer and return the frequency of the TSC * in kHz. * * Return ULONG_MAX on failure to calibrate. */ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) { u64 tsc, t1, t2, delta; unsigned long tscmin, tscmax; int pitcnt; if (!has_legacy_pic()) { /* * Relies on tsc_early_delay_calibrate() to have given us semi * usable udelay(), wait for the same 50ms we would have with * the PIT loop below. */ udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); return ULONG_MAX; } /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Setup CTC channel 2* for mode 0, (interrupt on terminal * count mode), binary count. Set the latch register to 50ms * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); outb(latch & 0xff, 0x42); outb(latch >> 8, 0x42); tsc = t1 = t2 = get_cycles(); pitcnt = 0; tscmax = 0; tscmin = ULONG_MAX; while ((inb(0x61) & 0x20) == 0) { t2 = get_cycles(); delta = t2 - tsc; tsc = t2; if ((unsigned long) delta < tscmin) tscmin = (unsigned int) delta; if ((unsigned long) delta > tscmax) tscmax = (unsigned int) delta; pitcnt++; } /* * Sanity checks: * * If we were not able to read the PIT more than loopmin * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ if (pitcnt < loopmin || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; do_div(delta, ms); return delta; } /* * This reads the current MSB of the PIT counter, and * checks if we are running on sufficiently fast and * non-virtualized hardware. * * Our expectations are: * * - the PIT is running at roughly 1.19MHz * * - each IO is going to take about 1us on real hardware, * but we allow it to be much faster (by a factor of 10) or * _slightly_ slower (ie we allow up to a 2us read+counter * update - anything else implies a unacceptably slow CPU * or PIT for the fast calibration to work. * * - with 256 PIT ticks to read the value, we have 214us to * see the same MSB (and overhead like doing a single TSC * read per MSB value etc). * * - We're doing 2 reads per loop (LSB, MSB), and we expect * them each to take about a microsecond on real hardware. * So we expect a count value of around 100. But we'll be * generous, and accept anything over 50. * * - if the PIT is stuck, and we see *many* more reads, we * return early (and the next caller of pit_expect_msb() * then consider it a failure when they don't see the * next expected value). * * These expectations mean that we know that we have seen the * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { /* Ignore LSB */ inb(0x42); return inb(0x42) == val; } static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; prev_tsc = tsc; tsc = get_cycles(); } *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* * We require _some_ success, but the quality control * will be based on the error terms on the TSC values. */ return count > 5; } /* * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend * more than 50ms on it. */ #define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) { int i; u64 tsc, delta; unsigned long d1, d2; if (!has_legacy_pic()) return 0; /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Counter 2, mode 0 (one-shot), binary count * * NOTE! Mode 2 decrements by two (and then the * output is flipped each time, giving the same * final output frequency as a decrement-by-one), * so mode 0 is much better when looking at the * individual counts. */ outb(0xb0, 0x43); /* Start at 0xffff */ outb(0xff, 0x42); outb(0xff, 0x42); /* * The PIT starts counting at the next edge, so we * need to delay for a microsecond. The easiest way * to do that is to just read back the 16-bit counter * once from the PIT. */ pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { if (!pit_expect_msb(0xff-i, &delta, &d2)) break; delta -= tsc; /* * Extrapolate the error and fail fast if the error will * never be below 500 ppm. */ if (i == 1 && d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) return 0; /* * Iterate until the error is less than 500 ppm */ if (d1+d2 >= delta >> 11) continue; /* * Check the PIT one more time to verify that * all TSC reads were stable wrt the PIT. * * This also guarantees serialization of the * last cycle read ('d2') in pit_expect_msb. */ if (!pit_verify_msb(0xfe - i)) break; goto success; } } pr_info("Fast TSC calibration failed\n"); return 0; success: /* * Ok, if we get here, then we've seen the * MSB of the PIT decrement 'i' times, and the * error has shrunk to less than 500 ppm. * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); pr_info("Fast TSC calibration using PIT\n"); return delta; } /** * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; unsigned int crystal_khz; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; crystal_khz = ecx_hz / 1000; /* * Denverton SoCs don't report crystal clock, and also don't support * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz * crystal clock. */ if (crystal_khz == 0 && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ if (crystal_khz != 0) setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); /* * Some Intel SoCs like Skylake and Kabylake don't report the crystal * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. */ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) { unsigned int eax_base_mhz, ebx, ecx, edx; cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } if (crystal_khz == 0) return 0; /* * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC /* * The local APIC appears to be fed by the core crystal clock * (which sounds entirely sensible). We can set the global * lapic_timer_period here to avoid having to calibrate the APIC * timer later. */ lapic_timer_period = crystal_khz * 1000 / HZ; #endif return crystal_khz * ebx_numerator / eax_denominator; } static unsigned long cpu_khz_from_cpuid(void) { unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } /* * calibrate cpu using pit, hpet, and ptimer methods. They are available * later in boot after acpi is initialized. */ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes * here: * * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and * load a timeout of 50ms. We read the time right after we * started the timer and wait until the PIT count down reaches * zero. In each wait loop iteration we read the TSC and check * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when * any disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. */ /* Preset PIT loop values */ latch = CAL_LATCH; ms = CAL_MS; loopmin = CAL_PIT_LOOPS; for (i = 0; i < 3; i++) { unsigned long tsc_pit_khz; /* * Read the start value and the reference count of * hpet/pmtimer when available. Then do the PIT * calibration, which will take at least 50ms, and * read the end value. */ local_irq_save(flags); tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed */ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) continue; tsc2 = (tsc2 - tsc1) * 1000000LL; if (hpet) tsc2 = calc_hpet_ref(tsc2, ref1, ref2); else tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); /* Check the reference deviation */ delta = ((u64) tsc_pit_min) * 100; do_div(delta, tsc_ref_min); /* * If both calibration results are inside a 10% window * then we can be sure, that the calibration * succeeded. We break out of the loop right away. We * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { pr_info("PIT calibration matches %s. %d loops\n", hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } /* * Check whether PIT failed more than once. This * happens in virtualized environments. We need to * give the virtual PC a slightly longer timeframe for * the HPET/PMTIMER to make the result precise. */ if (i == 1 && tsc_pit_min == ULONG_MAX) { latch = CAL2_LATCH; ms = CAL2_MS; loopmin = CAL2_PIT_LOOPS; } } /* * Now check the results. */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ pr_info("using %s reference calibration\n", hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } /* * The calibration values differ too much. In doubt, we use * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ pr_warn("PIT calibration deviates from %s: %lu %lu\n", hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /** * native_calibrate_cpu_early - can calibrate the cpu early in boot */ unsigned long native_calibrate_cpu_early(void) { unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); if (!fast_calibrate) fast_calibrate = cpu_khz_from_msr(); if (!fast_calibrate) { local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); } return fast_calibrate; } /** * native_calibrate_cpu - calibrate the cpu */ static unsigned long native_calibrate_cpu(void) { unsigned long tsc_freq = native_calibrate_cpu_early(); if (!tsc_freq) tsc_freq = pit_hpet_ptimer_calibrate_cpu(); return tsc_freq; } void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); #endif } EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); } /* * Even on processors with invariant TSC, TSC gets reset in some the * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to * arbitrary value (still sync'd across cpu's) during resume from such sleep * states. To cope up with this, recompute the cyc2ns_offset for each cpu so * that sched_clock() continues from the point where it was left off during * suspend. */ void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; int cpu; if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; local_irq_save(flags); /* * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) { per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; } local_irq_restore(flags); } #ifdef CONFIG_CPU_FREQ /* * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. */ static unsigned int ref_freq; static unsigned long loops_per_jiffy_ref; static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; if (num_online_cpus() > 1) { mark_tsc_unstable("cpufreq changes on SMP"); return 0; } if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { boot_cpu_data.loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); } return 0; } static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; static int __init cpufreq_register_tsc_scaling(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; } core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ #define ART_MIN_DENOMINATOR (1) /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { unsigned int unused; if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return; /* * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, * and the TSC counter resets must not occur asynchronously. */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || tsc_async_resets) return; cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator, &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); art_base_clk.freq_khz /= KHZ; if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } /* clocksource code */ static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); } /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. * * This sanity check is now done in the core timekeeping code. * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ static u64 read_tsc(struct clocksource *cs) { return (u64)rdtsc_ordered(); } static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } static void tsc_cs_tick_stable(struct clocksource *cs) { if (tsc_unstable) return; if (using_native_sched_clock()) sched_clock_tick_stable(); } static int tsc_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_TSC); return 0; } /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc_early.list), }; /* * Must mark VALID_FOR_HRES early such that when we unregister tsc_early * this one will immediately take over. We will only register if TSC has * been found good. */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); clocksource_mark_unstable(&clocksource_tsc_early); clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init tsc_disable_clocksource_watchdog(void) { clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } bool tsc_clocksource_watchdog_disabled(void) { return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && tsc_as_watchdog && !no_tsc_watchdog; } static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) if (is_geode_lx()) { /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; /* * Disable the clocksource watchdog when the system has: * - TSC running at constant frequency * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications * - not more than four packages */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && topology_max_packages() <= 4) tsc_disable_clocksource_watchdog(); } /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ int unsynchronized_tsc(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; #endif if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; if (tsc_clocksource_reliable) return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (topology_max_packages() > 1) return 1; } return 0; } static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. */ static void tsc_refine_calibration_work(struct work_struct *work) { static u64 tsc_start = ULLONG_MAX, ref_start; static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; int cpu; /* Don't bother refining TSC on unstable systems */ if (tsc_unstable) goto unreg; /* * Since the work is started early in boot, we may be * delayed the first time we expire. So set the workqueue * again once we know timers are working. */ if (tsc_start == ULLONG_MAX) { restart: /* * Only set hpet once, to avoid mixing hardware * if the hpet becomes enabled later. */ hpet = is_hpet_enabled(); tsc_start = tsc_read_refs(&ref_start, hpet); schedule_delayed_work(&tsc_irqwork, HZ); return; } tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed */ if (tsc_stop == ULLONG_MAX) goto restart; delta = tsc_stop - tsc_start; delta *= 1000000LL; if (hpet) freq = calc_hpet_ref(delta, ref_start, ref_stop); else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); /* Will hit this only if tsc_force_recalibrate has been set */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { /* Warn if the deviation exceeds 500 ppm */ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); } pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", hpet ? "HPET" : "PM_TIMER", (unsigned long)freq / 1000, (unsigned long)freq % 1000); return; } /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; tsc_khz = freq; pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); /* Update the sched_clock() rate to match the clocksource one */ for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: if (tsc_unstable) goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); } static int __init init_tsc_clocksource(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) { clocksource_unregister(&clocksource_tsc_early); return 0; } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* * When TSC frequency is known (retrieved via MSR or CPUID), we skip * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); if (!tsc_force_recalibrate) return 0; } schedule_delayed_work(&tsc_irqwork, 0); return 0; } /* * We use device_initcall here, to ensure we run after the hpet * is fully initialized, which may occur at fs_initcall time. */ device_initcall(init_tsc_clocksource); static bool __init determine_cpu_tsc_frequencies(bool early) { /* Make sure that cpu and tsc are not already calibrated */ WARN_ON(cpu_khz || tsc_khz); if (early) { cpu_khz = x86_platform.calibrate_cpu(); if (tsc_early_khz) { tsc_khz = tsc_early_khz; } else { tsc_khz = x86_platform.calibrate_tsc(); clocksource_tsc.freq_khz = tsc_khz; } } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } /* * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; if (tsc_khz == 0) return false; pr_info("Detected %lu.%03lu MHz processor\n", (unsigned long)cpu_khz / KHZ, (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", (unsigned long)tsc_khz / KHZ, (unsigned long)tsc_khz % KHZ); } return true; } static unsigned long __init get_loops_per_jiffy(void) { u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; } static void __init tsc_enable_sched_clock(void) { loops_per_jiffy = get_loops_per_jiffy(); use_tsc_delay(); /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); static_branch_enable(&__use_tsc); } void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; snp_secure_tsc_init(); if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); } void __init tsc_init(void) { if (!cpu_feature_enabled(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } /* * native_calibrate_cpu_early can only calibrate using methods that are * available early in boot. */ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) x86_platform.calibrate_cpu = native_calibrate_cpu; if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ if (!determine_cpu_tsc_frequencies(false)) { mark_tsc_unstable("could not calculate TSC khz"); setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } tsc_enable_sched_clock(); } cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); check_system_tsc_reliable(); if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); return; } if (tsc_clocksource_reliable || no_tsc_watchdog) tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } #ifdef CONFIG_SMP /* * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); /* * If TSC has constant frequency and TSC is synchronized across * sockets then reuse CPU0 calibration. */ if (constant_tsc && !tsc_unstable) return cpu_data(0).loops_per_jiffy; /* * If TSC has constant frequency and TSC is not synchronized across * sockets and this is not the first CPU in the socket, then reuse * the calibration value of an already online CPU on that socket. * * This assumes that CONSTANT_TSC is consistent for all CPUs in a * socket. */ if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; } #endif
5 5 5 5 2 4 1 2 3 27 27 27 8 27 13 13 729 5 2 728 4 720 720 5 5 5 5 1 4 3 1 1 3 8 8 8 8 8 8 8 596 596 169 224 542 542 384 384 43 276 1238 542 542 1234 680 630 277 173 43 193 39 167 55 55 55 1 1 18 4 3 4 2 3 3 3 3 3 21 2 2 2 2 1 1 1 1 9 9 5 1 4 4 17 14 3 10 10 10 10 10 10 1 9 10 10 9 1 6 1 1 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/sysctl.h> #include <linux/net.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/mpls.h> #include <linux/netconf.h> #include <linux/nospec.h> #include <linux/vmalloc.h> #include <linux/percpu.h> #include <net/gso.h> #include <net/ip.h> #include <net/dst.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/netevent.h> #include <net/ip_tunnels.h> #include <net/netns/generic.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #endif #include <net/ipv6_stubs.h> #include <net/rtnh.h> #include "internal.h" /* max memory we will use for mpls_route */ #define MAX_MPLS_ROUTE_MEM 4096 /* Maximum number of labels to look ahead at when selecting a path of * a multipath route */ #define MAX_MP_SELECT_LABELS 4 #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) static int label_limit = (1 << 20) - 1; static int ttl_max = 255; #if IS_ENABLED(CONFIG_NET_IP_TUNNEL) static size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct mpls_shim_hdr); } static const struct ip_tunnel_encap_ops mpls_iptun_ops = { .encap_hlen = ipgre_mpls_encap_hlen, }; static int ipgre_tunnel_encap_add_mpls_ops(void) { return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); } static void ipgre_tunnel_encap_del_mpls_ops(void) { ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); } #else static int ipgre_tunnel_encap_add_mpls_ops(void) { return 0; } static void ipgre_tunnel_encap_del_mpls_ops(void) { } #endif static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); static struct mpls_route *mpls_route_input(struct net *net, unsigned int index) { struct mpls_route __rcu **platform_label; platform_label = mpls_dereference(net, net->mpls.platform_label); return mpls_dereference(net, platform_label[index]); } static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned int index) { struct mpls_route __rcu **platform_label; if (index >= net->mpls.platform_labels) return NULL; platform_label = rcu_dereference(net->mpls.platform_label); return rcu_dereference(platform_label[index]); } bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); } EXPORT_SYMBOL_GPL(mpls_output_possible); static u8 *__mpls_nh_via(struct mpls_route *rt, struct mpls_nh *nh) { return (u8 *)nh + rt->rt_via_offset; } static const u8 *mpls_nh_via(const struct mpls_route *rt, const struct mpls_nh *nh) { return __mpls_nh_via((struct mpls_route *)rt, (struct mpls_nh *)nh); } static unsigned int mpls_nh_header_size(const struct mpls_nh *nh) { /* The size of the layer 2.5 labels to be added for this route */ return nh->nh_labels * sizeof(struct mpls_shim_hdr); } unsigned int mpls_dev_mtu(const struct net_device *dev) { /* The amount of data the layer 2 frame can hold */ return dev->mtu; } EXPORT_SYMBOL_GPL(mpls_dev_mtu); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); void mpls_stats_inc_outucastpkts(struct net *net, struct net_device *dev, const struct sk_buff *skb) { struct mpls_dev *mdev; if (skb->protocol == htons(ETH_P_MPLS_UC)) { mdev = mpls_dev_rcu(dev); if (mdev) MPLS_INC_STATS_LEN(mdev, skb->len, tx_packets, tx_bytes); } else if (skb->protocol == htons(ETH_P_IP)) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { struct inet6_dev *in6dev = in6_dev_rcu(dev); if (in6dev) IP6_UPD_PO_STATS(net, in6dev, IPSTATS_MIB_OUT, skb->len); #endif } } EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts); static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) { struct mpls_entry_decoded dec; unsigned int mpls_hdr_len = 0; struct mpls_shim_hdr *hdr; bool eli_seen = false; int label_index; u32 hash = 0; for (label_index = 0; label_index < MAX_MP_SELECT_LABELS; label_index++) { mpls_hdr_len += sizeof(*hdr); if (!pskb_may_pull(skb, mpls_hdr_len)) break; /* Read and decode the current label */ hdr = mpls_hdr(skb) + label_index; dec = mpls_entry_decode(hdr); /* RFC6790 - reserved labels MUST NOT be used as keys * for the load-balancing function */ if (likely(dec.label >= MPLS_LABEL_FIRST_UNRESERVED)) { hash = jhash_1word(dec.label, hash); /* The entropy label follows the entropy label * indicator, so this means that the entropy * label was just added to the hash - no need to * go any deeper either in the label stack or in the * payload */ if (eli_seen) break; } else if (dec.label == MPLS_LABEL_ENTROPY) { eli_seen = true; } if (!dec.bos) continue; /* found bottom label; does skb have room for a header? */ if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) { const struct iphdr *v4hdr; v4hdr = (const struct iphdr *)(hdr + 1); if (v4hdr->version == 4) { hash = jhash_3words(ntohl(v4hdr->saddr), ntohl(v4hdr->daddr), v4hdr->protocol, hash); } else if (v4hdr->version == 6 && pskb_may_pull(skb, mpls_hdr_len + sizeof(struct ipv6hdr))) { const struct ipv6hdr *v6hdr; v6hdr = (const struct ipv6hdr *)(hdr + 1); hash = __ipv6_addr_jhash(&v6hdr->saddr, hash); hash = __ipv6_addr_jhash(&v6hdr->daddr, hash); hash = jhash_1word(v6hdr->nexthdr, hash); } } break; } return hash; } static struct mpls_nh *mpls_get_nexthop(struct mpls_route *rt, u8 index) { return (struct mpls_nh *)((u8 *)rt->rt_nh + index * rt->rt_nh_size); } /* number of alive nexthops (rt->rt_nhn_alive) and the flags for * a next hop (nh->nh_flags) are modified by netdev event handlers. * Since those fields can change at any moment, use READ_ONCE to * access both. */ static const struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, struct sk_buff *skb) { u32 hash = 0; int nh_index = 0; int n = 0; u8 alive; /* No need to look further into packet if there's only * one path */ if (rt->rt_nhn == 1) return rt->rt_nh; alive = READ_ONCE(rt->rt_nhn_alive); if (alive == 0) return NULL; hash = mpls_multipath_hash(rt, skb); nh_index = hash % alive; if (alive == rt->rt_nhn) goto out; for_nexthops(rt) { unsigned int nh_flags = READ_ONCE(nh->nh_flags); if (nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) continue; if (n == nh_index) return nh; n++; } endfor_nexthops(rt); out: return mpls_get_nexthop(rt, nh_index); } static bool mpls_egress(struct net *net, struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) { enum mpls_payload_type payload_type; bool success = false; /* The IPv4 code below accesses through the IPv4 header * checksum, which is 12 bytes into the packet. * The IPv6 code below accesses through the IPv6 hop limit * which is 8 bytes into the packet. * * For all supported cases there should always be at least 12 * bytes of packet data present. The IPv4 header is 20 bytes * without options and the IPv6 header is always 40 bytes * long. */ if (!pskb_may_pull(skb, 12)) return false; payload_type = rt->rt_payload_type; if (payload_type == MPT_UNSPEC) payload_type = ip_hdr(skb)->version; switch (payload_type) { case MPT_IPV4: { struct iphdr *hdr4 = ip_hdr(skb); u8 new_ttl; skb->protocol = htons(ETH_P_IP); /* If propagating TTL, take the decremented TTL from * the incoming MPLS header, otherwise decrement the * TTL, but only if not 0 to avoid underflow. */ if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && net->mpls.ip_ttl_propagate)) new_ttl = dec.ttl; else new_ttl = hdr4->ttl ? hdr4->ttl - 1 : 0; csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), htons(new_ttl << 8)); hdr4->ttl = new_ttl; success = true; break; } case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); /* If propagating TTL, take the decremented TTL from * the incoming MPLS header, otherwise decrement the * hop limit, but only if not 0 to avoid underflow. */ if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && net->mpls.ip_ttl_propagate)) hdr6->hop_limit = dec.ttl; else if (hdr6->hop_limit) hdr6->hop_limit = hdr6->hop_limit - 1; success = true; break; } case MPT_UNSPEC: /* Should have decided which protocol it is by now */ break; } return success; } static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net_rcu(dev); struct mpls_shim_hdr *hdr; const struct mpls_nh *nh; struct mpls_route *rt; struct mpls_entry_decoded dec; struct net_device *out_dev; struct mpls_dev *out_mdev; struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; int err; /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_rcu(dev); if (!mdev) goto drop; MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets, rx_bytes); if (!mdev->input_enabled) { MPLS_INC_STATS(mdev, rx_dropped); goto drop; } if (skb->pkt_type != PACKET_HOST) goto err; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto err; if (!pskb_may_pull(skb, sizeof(*hdr))) goto err; skb_dst_drop(skb); /* Read and decode the label */ hdr = mpls_hdr(skb); dec = mpls_entry_decode(hdr); rt = mpls_route_input_rcu(net, dec.label); if (!rt) { MPLS_INC_STATS(mdev, rx_noroute); goto drop; } nh = mpls_select_multipath(rt, skb); if (!nh) goto err; /* Pop the label */ skb_pull(skb, sizeof(*hdr)); skb_reset_network_header(skb); skb_orphan(skb); if (skb_warn_if_lro(skb)) goto err; skb_forward_csum(skb); /* Verify ttl is valid */ if (dec.ttl <= 1) goto err; /* Find the output device */ out_dev = nh->nh_dev; if (!mpls_output_possible(out_dev)) goto tx_err; /* Verify the destination can hold the packet */ new_header_size = mpls_nh_header_size(nh); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) goto tx_err; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) hh_len = 0; /* Ensure there is enough space for the headers in the skb */ if (skb_cow(skb, hh_len + new_header_size)) goto tx_err; skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(net, rt, skb, dec)) goto err; } else { bool bos; int i; skb_push(skb, new_header_size); skb_reset_network_header(skb); /* Push the new labels */ hdr = mpls_hdr(skb); bos = dec.bos; for (i = nh->nh_labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(nh->nh_label[i], dec.ttl, 0, bos); bos = false; } } mpls_stats_inc_outucastpkts(net, out_dev, skb); /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, out_dev->dev_addr, skb); else err = neigh_xmit(nh->nh_via_table, out_dev, mpls_nh_via(rt, nh), skb); if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); return 0; tx_err: out_mdev = out_dev ? mpls_dev_rcu(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); goto drop; err: MPLS_INC_STATS(mdev, rx_errors); drop: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type mpls_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), .func = mpls_forward, }; static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_TTL_PROPAGATE] = { .type = NLA_U8 }, }; struct mpls_route_config { u32 rc_protocol; u32 rc_ifindex; u8 rc_via_table; u8 rc_via_alen; u8 rc_via[MAX_VIA_ALEN]; u32 rc_label; u8 rc_ttl_propagate; u8 rc_output_labels; u32 rc_output_label[MAX_NEW_LABELS]; u32 rc_nlflags; enum mpls_payload_type rc_payload_type; struct nl_info rc_nlinfo; struct rtnexthop *rc_mp; int rc_mp_len; }; /* all nexthops within a route have the same size based on max * number of labels and max via length for a hop */ static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen, u8 max_labels) { u8 nh_size = MPLS_NH_SIZE(max_labels, max_alen); struct mpls_route *rt; size_t size; size = sizeof(*rt) + num_nh * nh_size; if (size > MAX_MPLS_ROUTE_MEM) return ERR_PTR(-EINVAL); rt = kzalloc(size, GFP_KERNEL); if (!rt) return ERR_PTR(-ENOMEM); rt->rt_nhn = num_nh; rt->rt_nhn_alive = num_nh; rt->rt_nh_size = nh_size; rt->rt_via_offset = MPLS_NH_VIA_OFF(max_labels); return rt; } static void mpls_rt_free_rcu(struct rcu_head *head) { struct mpls_route *rt; rt = container_of(head, struct mpls_route, rt_rcu); change_nexthops(rt) { netdev_put(nh->nh_dev, &nh->nh_dev_tracker); } endfor_nexthops(rt); kfree(rt); } static void mpls_rt_free(struct mpls_route *rt) { if (rt) call_rcu(&rt->rt_rcu, mpls_rt_free_rcu); } static void mpls_notify_route(struct net *net, unsigned index, struct mpls_route *old, struct mpls_route *new, const struct nl_info *info) { struct nlmsghdr *nlh = info ? info->nlh : NULL; unsigned portid = info ? info->portid : 0; int event = new ? RTM_NEWROUTE : RTM_DELROUTE; struct mpls_route *rt = new ? new : old; unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; /* Ignore reserved labels for now */ if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED)) rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); } static void mpls_route_update(struct net *net, unsigned index, struct mpls_route *new, const struct nl_info *info) { struct mpls_route __rcu **platform_label; struct mpls_route *rt; platform_label = mpls_dereference(net, net->mpls.platform_label); rt = mpls_dereference(net, platform_label[index]); rcu_assign_pointer(platform_label[index], new); mpls_notify_route(net, index, rt, new, info); /* If we removed a route free it now */ mpls_rt_free(rt); } static unsigned int find_free_label(struct net *net) { unsigned int index; for (index = MPLS_LABEL_FIRST_UNRESERVED; index < net->mpls.platform_labels; index++) { if (!mpls_route_input(net, index)) return index; } return LABEL_NOT_SPECIFIED; } #if IS_ENABLED(CONFIG_INET) static struct net_device *inet_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { struct net_device *dev; struct rtable *rt; struct in_addr daddr; memcpy(&daddr, addr, sizeof(struct in_addr)); rt = ip_route_output(net, daddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); ip_rt_put(rt); return dev; } #else static struct net_device *inet_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); } #endif #if IS_ENABLED(CONFIG_IPV6) static struct net_device *inet6_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; if (!ipv6_stub) return ERR_PTR(-EAFNOSUPPORT); memset(&fl6, 0, sizeof(fl6)); memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) return ERR_CAST(dst); dev = dst->dev; netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); dst_release(dst); return dev; } #else static struct net_device *inet6_fib_lookup_dev(struct net *net, struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); } #endif static struct net_device *find_outdev(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif) { struct net_device *dev = NULL; if (!oif) { switch (nh->nh_via_table) { case NEIGH_ARP_TABLE: dev = inet_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_ND_TABLE: dev = inet6_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_LINK_TABLE: break; } } else { dev = netdev_get_by_index(net, oif, &nh->nh_dev_tracker, GFP_KERNEL); } if (!dev) return ERR_PTR(-ENODEV); if (IS_ERR(dev)) return dev; nh->nh_dev = dev; return dev; } static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif) { struct net_device *dev = NULL; int err = -ENODEV; dev = find_outdev(net, rt, nh, oif); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto errout; } /* Ensure this is a supported device */ err = -EINVAL; if (!mpls_dev_get(net, dev)) goto errout_put; if ((nh->nh_via_table == NEIGH_LINK_TABLE) && (dev->addr_len != nh->nh_via_alen)) goto errout_put; if (!(dev->flags & IFF_UP)) { nh->nh_flags |= RTNH_F_DEAD; } else { unsigned int flags; flags = netif_get_flags(dev); if (!(flags & (IFF_RUNNING | IFF_LOWER_UP))) nh->nh_flags |= RTNH_F_LINKDOWN; } return 0; errout_put: netdev_put(nh->nh_dev, &nh->nh_dev_tracker); nh->nh_dev = NULL; errout: return err; } static int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, u8 via_addr[], struct netlink_ext_ack *extack) { struct rtvia *via = nla_data(nla); int err = -EINVAL; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid attribute length for RTA_VIA"); goto errout; } alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); if (alen > MAX_VIA_ALEN) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid address length for RTA_VIA"); goto errout; } /* Validate the address family */ switch (via->rtvia_family) { case AF_PACKET: *via_table = NEIGH_LINK_TABLE; break; case AF_INET: *via_table = NEIGH_ARP_TABLE; if (alen != 4) goto errout; break; case AF_INET6: *via_table = NEIGH_ND_TABLE; if (alen != 16) goto errout; break; default: /* Unsupported address family */ goto errout; } memcpy(via_addr, via->rtvia_addr, alen); *via_alen = alen; err = 0; errout: return err; } static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, struct mpls_route *rt) { struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_nh *nh = rt->rt_nh; int err; int i; if (!nh) return -ENOMEM; nh->nh_labels = cfg->rc_output_labels; for (i = 0; i < nh->nh_labels; i++) nh->nh_label[i] = cfg->rc_output_label[i]; nh->nh_via_table = cfg->rc_via_table; memcpy(__mpls_nh_via(rt, nh), cfg->rc_via, cfg->rc_via_alen); nh->nh_via_alen = cfg->rc_via_alen; err = mpls_nh_assign_dev(net, rt, nh, cfg->rc_ifindex); if (err) goto errout; if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) rt->rt_nhn_alive--; return 0; errout: return err; } static int mpls_nh_build(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif, struct nlattr *via, struct nlattr *newdst, u8 max_labels, struct netlink_ext_ack *extack) { int err = -ENOMEM; if (!nh) goto errout; if (newdst) { err = nla_get_labels(newdst, max_labels, &nh->nh_labels, nh->nh_label, extack); if (err) goto errout; } if (via) { err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table, __mpls_nh_via(rt, nh), extack); if (err) goto errout; } else { nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC; } err = mpls_nh_assign_dev(net, rt, nh, oif); if (err) goto errout; return 0; errout: return err; } static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, u8 cfg_via_alen, u8 *max_via_alen, u8 *max_labels) { int remaining = len; u8 nhs = 0; *max_via_alen = 0; *max_labels = 0; while (rtnh_ok(rtnh, remaining)) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); int attrlen; u8 n_labels = 0; attrlen = rtnh_attrlen(rtnh); nla = nla_find(attrs, attrlen, RTA_VIA); if (nla && nla_len(nla) >= offsetof(struct rtvia, rtvia_addr)) { int via_alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); if (via_alen <= MAX_VIA_ALEN) *max_via_alen = max_t(u16, *max_via_alen, via_alen); } nla = nla_find(attrs, attrlen, RTA_NEWDST); if (nla && nla_get_labels(nla, MAX_NEW_LABELS, &n_labels, NULL, NULL) != 0) return 0; *max_labels = max_t(u8, *max_labels, n_labels); /* number of nexthops is tracked by a u8. * Check for overflow. */ if (nhs == 255) return 0; nhs++; rtnh = rtnh_next(rtnh, &remaining); } /* leftover implies invalid nexthop configuration, discard it */ return remaining > 0 ? 0 : nhs; } static int mpls_nh_build_multi(struct mpls_route_config *cfg, struct mpls_route *rt, u8 max_labels, struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = cfg->rc_mp; struct nlattr *nla_via, *nla_newdst; int remaining = cfg->rc_mp_len; int err = 0; rt->rt_nhn = 0; change_nexthops(rt) { int attrlen; nla_via = NULL; nla_newdst = NULL; err = -EINVAL; if (!rtnh_ok(rtnh, remaining)) goto errout; /* neither weighted multipath nor any flags * are supported */ if (rtnh->rtnh_hops || rtnh->rtnh_flags) goto errout; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *attrs = rtnh_attrs(rtnh); nla_via = nla_find(attrs, attrlen, RTA_VIA); nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST); } err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh, rtnh->rtnh_ifindex, nla_via, nla_newdst, max_labels, extack); if (err) goto errout; if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) rt->rt_nhn_alive--; rtnh = rtnh_next(rtnh, &remaining); rt->rt_nhn++; } endfor_nexthops(rt); return 0; errout: return err; } static bool mpls_label_ok(struct net *net, unsigned int *index, struct netlink_ext_ack *extack) { /* Reserved labels may not be set */ if (*index < MPLS_LABEL_FIRST_UNRESERVED) { NL_SET_ERR_MSG(extack, "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher"); return false; } /* The full 20 bit range may not be supported. */ if (*index >= net->mpls.platform_labels) { NL_SET_ERR_MSG(extack, "Label >= configured maximum in platform_labels"); return false; } *index = array_index_nospec(*index, net->mpls.platform_labels); return true; } static int mpls_route_add(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_route *rt, *old; int err = -EINVAL; u8 max_via_alen; unsigned index; u8 max_labels; u8 nhs; index = cfg->rc_label; /* If a label was not specified during insert pick one */ if ((index == LABEL_NOT_SPECIFIED) && (cfg->rc_nlflags & NLM_F_CREATE)) { index = find_free_label(net); } if (!mpls_label_ok(net, &index, extack)) goto errout; /* Append makes no sense with mpls */ err = -EOPNOTSUPP; if (cfg->rc_nlflags & NLM_F_APPEND) { NL_SET_ERR_MSG(extack, "MPLS does not support route append"); goto errout; } err = -EEXIST; old = mpls_route_input(net, index); if ((cfg->rc_nlflags & NLM_F_EXCL) && old) goto errout; err = -EEXIST; if (!(cfg->rc_nlflags & NLM_F_REPLACE) && old) goto errout; err = -ENOENT; if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old) goto errout; err = -EINVAL; if (cfg->rc_mp) { nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len, cfg->rc_via_alen, &max_via_alen, &max_labels); } else { max_via_alen = cfg->rc_via_alen; max_labels = cfg->rc_output_labels; nhs = 1; } if (nhs == 0) { NL_SET_ERR_MSG(extack, "Route does not contain a nexthop"); goto errout; } rt = mpls_rt_alloc(nhs, max_via_alen, max_labels); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto errout; } rt->rt_protocol = cfg->rc_protocol; rt->rt_payload_type = cfg->rc_payload_type; rt->rt_ttl_propagate = cfg->rc_ttl_propagate; if (cfg->rc_mp) err = mpls_nh_build_multi(cfg, rt, max_labels, extack); else err = mpls_nh_build_from_cfg(cfg, rt); if (err) goto freert; mpls_route_update(net, index, rt, &cfg->rc_nlinfo); return 0; freert: mpls_rt_free(rt); errout: return err; } static int mpls_route_del(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct net *net = cfg->rc_nlinfo.nl_net; unsigned index; int err = -EINVAL; index = cfg->rc_label; if (!mpls_label_ok(net, &index, extack)) goto errout; mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); err = 0; errout: return err; } static void mpls_get_stats(struct mpls_dev *mdev, struct mpls_link_stats *stats) { struct mpls_pcpu_stats *p; int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { struct mpls_link_stats local; unsigned int start; p = per_cpu_ptr(mdev->stats, i); do { start = u64_stats_fetch_begin(&p->syncp); local = p->stats; } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; stats->tx_packets += local.tx_packets; stats->tx_bytes += local.tx_bytes; stats->rx_errors += local.rx_errors; stats->tx_errors += local.tx_errors; stats->rx_dropped += local.rx_dropped; stats->tx_dropped += local.tx_dropped; stats->rx_noroute += local.rx_noroute; } } static int mpls_fill_stats_af(struct sk_buff *skb, const struct net_device *dev) { struct mpls_link_stats *stats; struct mpls_dev *mdev; struct nlattr *nla; mdev = mpls_dev_rcu(dev); if (!mdev) return -ENODATA; nla = nla_reserve_64bit(skb, MPLS_STATS_LINK, sizeof(struct mpls_link_stats), MPLS_STATS_UNSPEC); if (!nla) return -EMSGSIZE; stats = nla_data(nla); mpls_get_stats(mdev, stats); return 0; } static size_t mpls_get_stats_af_size(const struct net_device *dev) { struct mpls_dev *mdev; mdev = mpls_dev_rcu(dev); if (!mdev) return 0; return nla_total_size_64bit(sizeof(struct mpls_link_stats)); } static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_MPLS; if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0) goto nla_put_failure; if ((all || type == NETCONFA_INPUT) && nla_put_s32(skb, NETCONFA_INPUT, READ_ONCE(mdev->input_enabled)) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int mpls_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_INPUT) size += nla_total_size(4); return size; } static void mpls_netconf_notify_devconf(struct net *net, int event, int type, struct mpls_dev *mdev) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err); } static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, }; static int mpls_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_mpls_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_mpls_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int mpls_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; struct net_device *dev; struct mpls_dev *mdev; struct sk_buff *skb; int ifindex; int err; err = mpls_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; if (!tb[NETCONFA_IFINDEX]) { err = -EINVAL; goto errout; } ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { err = -EINVAL; goto errout_unlock; } mdev = mpls_dev_rcu(dev); if (!mdev) { err = -EINVAL; goto errout_unlock; } err = mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); goto errout_unlock; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); rcu_read_unlock(); errout: return err; errout_unlock: rcu_read_unlock(); kfree_skb(skb); goto errout; } static int mpls_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; } *ctx = (void *)cb->ctx; struct net_device *dev; struct mpls_dev *mdev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { mdev = mpls_dev_rcu(dev); if (!mdev) continue; err = mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) break; } rcu_read_unlock(); return err; } #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { struct mpls_dev *mdev = ctl->extra1; int i = (int *)ctl->data - (int *)mdev; struct net *net = ctl->extra2; int val = *(int *)ctl->data; if (i == offsetof(struct mpls_dev, input_enabled) && val != oval) { mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_INPUT, mdev); } } return ret; } static const struct ctl_table mpls_dev_table[] = { { .procname = "input", .maxlen = sizeof(int), .mode = 0644, .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, }; static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; size_t table_size = ARRAY_SIZE(mpls_dev_table); struct net *net = dev_net(dev); struct ctl_table *table; int i; table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL); if (!table) goto out; /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ for (i = 0; i < table_size; i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; table[i].extra1 = mdev; table[i].extra2 = net; } snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); mdev->sysctl = register_net_sysctl_sz(net, path, table, table_size); if (!mdev->sysctl) goto free; mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, mdev); return 0; free: kfree(table); out: mdev->sysctl = NULL; return -ENOBUFS; } static void mpls_dev_sysctl_unregister(struct net_device *dev, struct mpls_dev *mdev) { struct net *net = dev_net(dev); const struct ctl_table *table; if (!mdev->sysctl) return; table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); mpls_netconf_notify_devconf(net, RTM_DELNETCONF, 0, mdev); } static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; int err = -ENOMEM; int i; mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(err); mdev->stats = alloc_percpu(struct mpls_pcpu_stats); if (!mdev->stats) goto free; for_each_possible_cpu(i) { struct mpls_pcpu_stats *mpls_stats; mpls_stats = per_cpu_ptr(mdev->stats, i); u64_stats_init(&mpls_stats->syncp); } mdev->dev = dev; err = mpls_dev_sysctl_register(dev, mdev); if (err) goto free; rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; free: free_percpu(mdev->stats); kfree(mdev); return ERR_PTR(err); } static void mpls_dev_destroy_rcu(struct rcu_head *head) { struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu); free_percpu(mdev->stats); kfree(mdev); } static int mpls_ifdown(struct net_device *dev, int event) { struct net *net = dev_net(dev); unsigned int index; for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt; bool nh_del = false; u8 alive = 0; rt = mpls_route_input(net, index); if (!rt) continue; if (event == NETDEV_UNREGISTER) { u8 deleted = 0; for_nexthops(rt) { if (!nh->nh_dev || nh->nh_dev == dev) deleted++; if (nh->nh_dev == dev) nh_del = true; } endfor_nexthops(rt); /* if there are no more nexthops, delete the route */ if (deleted == rt->rt_nhn) { mpls_route_update(net, index, NULL, NULL); continue; } if (nh_del) { size_t size = sizeof(*rt) + rt->rt_nhn * rt->rt_nh_size; struct mpls_route *orig = rt; rt = kmemdup(orig, size, GFP_KERNEL); if (!rt) return -ENOMEM; } } change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; if (nh->nh_dev != dev) { if (nh_del) netdev_hold(nh->nh_dev, &nh->nh_dev_tracker, GFP_KERNEL); goto next; } switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nh_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: nh_flags |= RTNH_F_LINKDOWN; break; } if (event == NETDEV_UNREGISTER) nh->nh_dev = NULL; if (nh->nh_flags != nh_flags) WRITE_ONCE(nh->nh_flags, nh_flags); next: if (!(nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))) alive++; } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); if (nh_del) mpls_route_update(net, index, rt, NULL); } return 0; } static void mpls_ifup(struct net_device *dev, unsigned int flags) { struct net *net = dev_net(dev); unsigned int index; u8 alive; for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt; rt = mpls_route_input(net, index); if (!rt) continue; alive = 0; change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; if (!(nh_flags & flags)) { alive++; continue; } if (nh->nh_dev != dev) continue; alive++; nh_flags &= ~flags; WRITE_ONCE(nh->nh_flags, nh_flags); } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); } } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mpls_dev *mdev; unsigned int flags; int err; mutex_lock(&net->mpls.platform_mutex); if (event == NETDEV_REGISTER) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) { err = PTR_ERR(mdev); goto err; } goto out; } mdev = mpls_dev_get(net, dev); if (!mdev) goto out; switch (event) { case NETDEV_DOWN: err = mpls_ifdown(dev, event); if (err) goto err; break; case NETDEV_UP: flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); else mpls_ifup(dev, RTNH_F_DEAD); break; case NETDEV_CHANGE: flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) { mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); } else { err = mpls_ifdown(dev, event); if (err) goto err; } break; case NETDEV_UNREGISTER: err = mpls_ifdown(dev, event); if (err) goto err; mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); call_rcu(&mdev->rcu, mpls_dev_destroy_rcu); } break; case NETDEV_CHANGENAME: mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); err = mpls_dev_sysctl_register(dev, mdev); if (err) goto err; } break; } out: mutex_unlock(&net->mpls.platform_mutex); return NOTIFY_OK; err: mutex_unlock(&net->mpls.platform_mutex); return notifier_from_errno(err); } static struct notifier_block mpls_dev_notifier = { .notifier_call = mpls_dev_notify, }; static int nla_put_via(struct sk_buff *skb, u8 table, const void *addr, int alen) { static const int table_to_family[NEIGH_NR_TABLES + 1] = { AF_INET, AF_INET6, AF_PACKET, }; struct nlattr *nla; struct rtvia *via; int family = AF_UNSPEC; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) return -EMSGSIZE; if (table <= NEIGH_NR_TABLES) family = table_to_family[table]; via = nla_data(nla); via->rtvia_family = family; memcpy(via->rtvia_addr, addr, alen); return 0; } int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]) { struct nlattr *nla; struct mpls_shim_hdr *nla_label; bool bos; int i; nla = nla_reserve(skb, attrtype, labels*4); if (!nla) return -EMSGSIZE; nla_label = nla_data(nla); bos = true; for (i = labels - 1; i >= 0; i--) { nla_label[i] = mpls_entry_encode(label[i], 0, 0, bos); bos = false; } return 0; } EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[], struct netlink_ext_ack *extack) { unsigned len = nla_len(nla); struct mpls_shim_hdr *nla_label; u8 nla_labels; bool bos; int i; /* len needs to be an even multiple of 4 (the label size). Number * of labels is a u8 so check for overflow. */ if (len & 3 || len / 4 > 255) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid length for labels attribute"); return -EINVAL; } /* Limit the number of new labels allowed */ nla_labels = len/4; if (nla_labels > max_labels) { NL_SET_ERR_MSG(extack, "Too many labels"); return -EINVAL; } /* when label == NULL, caller wants number of labels */ if (!label) goto out; nla_label = nla_data(nla); bos = true; for (i = nla_labels - 1; i >= 0; i--, bos = false) { struct mpls_entry_decoded dec; dec = mpls_entry_decode(nla_label + i); /* Ensure the bottom of stack flag is properly set * and ttl and tc are both clear. */ if (dec.ttl) { NL_SET_ERR_MSG_ATTR(extack, nla, "TTL in label must be 0"); return -EINVAL; } if (dec.tc) { NL_SET_ERR_MSG_ATTR(extack, nla, "Traffic class in label must be 0"); return -EINVAL; } if (dec.bos != bos) { NL_SET_BAD_ATTR(extack, nla); if (bos) { NL_SET_ERR_MSG(extack, "BOS bit must be set in first label"); } else { NL_SET_ERR_MSG(extack, "BOS bit can only be set in first label"); } return -EINVAL; } switch (dec.label) { case MPLS_LABEL_IMPLNULL: /* RFC3032: This is a label that an LSR may * assign and distribute, but which never * actually appears in the encapsulation. */ NL_SET_ERR_MSG_ATTR(extack, nla, "Implicit NULL Label (3) can not be used in encapsulation"); return -EINVAL; } label[i] = dec.label; } out: *labels = nla_labels; return 0; } EXPORT_SYMBOL_GPL(nla_get_labels); static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; int index; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); if (rtm->rtm_family != AF_MPLS) { NL_SET_ERR_MSG(extack, "Invalid address family in rtmsg"); goto errout; } if (rtm->rtm_dst_len != 20) { NL_SET_ERR_MSG(extack, "rtm_dst_len must be 20 for MPLS"); goto errout; } if (rtm->rtm_src_len != 0) { NL_SET_ERR_MSG(extack, "rtm_src_len must be 0 for MPLS"); goto errout; } if (rtm->rtm_tos != 0) { NL_SET_ERR_MSG(extack, "rtm_tos must be 0 for MPLS"); goto errout; } if (rtm->rtm_table != RT_TABLE_MAIN) { NL_SET_ERR_MSG(extack, "MPLS only supports the main route table"); goto errout; } /* Any value is acceptable for rtm_protocol */ /* As mpls uses destination specific addresses * (or source specific address in the case of multicast) * all addresses have universal scope. */ if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) { NL_SET_ERR_MSG(extack, "Invalid route scope - MPLS only supports UNIVERSE"); goto errout; } if (rtm->rtm_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Invalid route type - MPLS only supports UNICAST"); goto errout; } if (rtm->rtm_flags != 0) { NL_SET_ERR_MSG(extack, "rtm_flags must be 0 for MPLS"); goto errout; } cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; cfg->rc_ttl_propagate = MPLS_TTL_PROP_DEFAULT; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; cfg->rc_nlinfo.nl_net = sock_net(skb->sk); for (index = 0; index <= RTA_MAX; index++) { struct nlattr *nla = tb[index]; if (!nla) continue; switch (index) { case RTA_OIF: cfg->rc_ifindex = nla_get_u32(nla); break; case RTA_NEWDST: if (nla_get_labels(nla, MAX_NEW_LABELS, &cfg->rc_output_labels, cfg->rc_output_label, extack)) goto errout; break; case RTA_DST: { u8 label_count; if (nla_get_labels(nla, 1, &label_count, &cfg->rc_label, extack)) goto errout; if (!mpls_label_ok(cfg->rc_nlinfo.nl_net, &cfg->rc_label, extack)) goto errout; break; } case RTA_GATEWAY: NL_SET_ERR_MSG(extack, "MPLS does not support RTA_GATEWAY attribute"); goto errout; case RTA_VIA: { if (nla_get_via(nla, &cfg->rc_via_alen, &cfg->rc_via_table, cfg->rc_via, extack)) goto errout; break; } case RTA_MULTIPATH: { cfg->rc_mp = nla_data(nla); cfg->rc_mp_len = nla_len(nla); break; } case RTA_TTL_PROPAGATE: { u8 ttl_propagate = nla_get_u8(nla); if (ttl_propagate > 1) { NL_SET_ERR_MSG_ATTR(extack, nla, "RTA_TTL_PROPAGATE can only be 0 or 1"); goto errout; } cfg->rc_ttl_propagate = ttl_propagate ? MPLS_TTL_PROP_ENABLED : MPLS_TTL_PROP_DISABLED; break; } default: NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute"); /* Unsupported attribute */ goto errout; } } err = 0; errout: return err; } static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; mutex_lock(&net->mpls.platform_mutex); err = mpls_route_del(cfg, extack); mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); return err; } static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; mutex_lock(&net->mpls.platform_mutex); err = mpls_route_add(cfg, extack); mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); return err; } static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 label, struct mpls_route *rt, int flags) { struct net_device *dev; struct nlmsghdr *nlh; struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (nlh == NULL) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_MPLS; rtm->rtm_dst_len = 20; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; rtm->rtm_protocol = rt->rt_protocol; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_labels(skb, RTA_DST, 1, &label)) goto nla_put_failure; if (rt->rt_ttl_propagate != MPLS_TTL_PROP_DEFAULT) { bool ttl_propagate = rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED; if (nla_put_u8(skb, RTA_TTL_PROPAGATE, ttl_propagate)) goto nla_put_failure; } if (rt->rt_nhn == 1) { const struct mpls_nh *nh = rt->rt_nh; if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (nh->nh_flags & RTNH_F_LINKDOWN) rtm->rtm_flags |= RTNH_F_LINKDOWN; if (nh->nh_flags & RTNH_F_DEAD) rtm->rtm_flags |= RTNH_F_DEAD; } else { struct rtnexthop *rtnh; struct nlattr *mp; u8 linkdown = 0; u8 dead = 0; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; for_nexthops(rt) { dev = nh->nh_dev; if (!dev) continue; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_ifindex = dev->ifindex; if (nh->nh_flags & RTNH_F_LINKDOWN) { rtnh->rtnh_flags |= RTNH_F_LINKDOWN; linkdown++; } if (nh->nh_flags & RTNH_F_DEAD) { rtnh->rtnh_flags |= RTNH_F_DEAD; dead++; } if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; } endfor_nexthops(rt); if (linkdown == rt->rt_nhn) rtm->rtm_flags |= RTNH_F_LINKDOWN; if (dead == rt->rt_nhn) rtm->rtm_flags |= RTNH_F_DEAD; nla_nest_end(skb, mp); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } #if IS_ENABLED(CONFIG_INET) static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { return ip_valid_fib_dump_req(net, nlh, filter, cb); } #else static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); return -EINVAL; } if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_protocol) { filter->protocol = rtm->rtm_protocol; filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (i == RTA_OIF) { ifindex = nla_get_u32(tb[i]); filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; filter->filter_set = 1; } else if (tb[i]) { NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request"); return -EINVAL; } } return 0; } #endif static bool mpls_rt_uses_dev(struct mpls_route *rt, const struct net_device *dev) { if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; if (nh->nh_dev == dev) return true; } else { for_nexthops(rt) { if (nh->nh_dev == dev) return true; } endfor_nexthops(rt); } return false; } static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; struct fib_dump_filter filter = { .rtnl_held = false, }; unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; int err; rcu_read_lock(); if (cb->strict_check) { err = mpls_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto err; /* for MPLS, there is only 1 table with fixed type and flags. * If either are set in the filter then return nothing. */ if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) || (filter.rt_type && filter.rt_type != RTN_UNICAST) || filter.flags) goto unlock; } index = cb->args[0]; if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; platform_label = rcu_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; if (filter.filter_set) flags |= NLM_F_DUMP_FILTERED; for (; index < platform_labels; index++) { struct mpls_route *rt; rt = rcu_dereference(platform_label[index]); if (!rt) continue; if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) || (filter.protocol && rt->rt_protocol != filter.protocol)) continue; if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, index, rt, flags) < 0) break; } cb->args[0] = index; unlock: rcu_read_unlock(); return skb->len; err: rcu_read_unlock(); return err; } static inline size_t lfib_nlmsg_size(struct mpls_route *rt) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_DST */ + nla_total_size(1); /* RTA_TTL_PROPAGATE */ if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; if (nh->nh_dev) payload += nla_total_size(4); /* RTA_OIF */ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */ payload += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) /* RTA_NEWDST */ payload += nla_total_size(nh->nh_labels * 4); } else { /* each nexthop is packed in an attribute */ size_t nhsize = 0; for_nexthops(rt) { if (!nh->nh_dev) continue; nhsize += nla_total_size(sizeof(struct rtnexthop)); /* RTA_VIA */ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) nhsize += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) nhsize += nla_total_size(nh->nh_labels * 4); } endfor_nexthops(rt); /* nested attribute */ payload += nla_total_size(nhsize); } return payload; } static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags) { struct sk_buff *skb; u32 seq = nlh ? nlh->nlmsg_seq : 0; int err = -ENOBUFS; skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); if (skb == NULL) goto errout; err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); } static int mpls_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); return -EINVAL; } if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { NL_SET_ERR_MSG_MOD(extack, "Invalid flags for get route request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err) return err; if ((tb[RTA_DST] || tb[RTA_NEWDST]) && !rtm->rtm_dst_len) { NL_SET_ERR_MSG_MOD(extack, "rtm_dst_len must be 20 for MPLS"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_DST: case RTA_NEWDST: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); return -EINVAL; } } return 0; } static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); u32 portid = NETLINK_CB(in_skb).portid; u32 in_label = LABEL_NOT_SPECIFIED; struct nlattr *tb[RTA_MAX + 1]; struct mpls_route *rt = NULL; u32 labels[MAX_NEW_LABELS]; struct mpls_shim_hdr *hdr; unsigned int hdr_size = 0; const struct mpls_nh *nh; struct net_device *dev; struct rtmsg *rtm, *r; struct nlmsghdr *nlh; struct sk_buff *skb; u8 n_labels; int err; mutex_lock(&net->mpls.platform_mutex); err = mpls_valid_getroute_req(in_skb, in_nlh, tb, extack); if (err < 0) goto errout; rtm = nlmsg_data(in_nlh); if (tb[RTA_DST]) { u8 label_count; if (nla_get_labels(tb[RTA_DST], 1, &label_count, &in_label, extack)) { err = -EINVAL; goto errout; } if (!mpls_label_ok(net, &in_label, extack)) { err = -EINVAL; goto errout; } } if (in_label < net->mpls.platform_labels) rt = mpls_route_input(net, in_label); if (!rt) { err = -ENETUNREACH; goto errout; } if (rtm->rtm_flags & RTM_F_FIB_MATCH) { skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } err = mpls_dump_route(skb, portid, in_nlh->nlmsg_seq, RTM_NEWROUTE, in_label, rt, 0); if (err < 0) { /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, portid); goto errout; } if (tb[RTA_NEWDST]) { if (nla_get_labels(tb[RTA_NEWDST], MAX_NEW_LABELS, &n_labels, labels, extack) != 0) { err = -EINVAL; goto errout; } hdr_size = n_labels * sizeof(struct mpls_shim_hdr); } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } skb->protocol = htons(ETH_P_MPLS_UC); if (hdr_size) { bool bos; int i; if (skb_cow(skb, hdr_size)) { err = -ENOBUFS; goto errout_free; } skb_reserve(skb, hdr_size); skb_push(skb, hdr_size); skb_reset_network_header(skb); /* Push new labels */ hdr = mpls_hdr(skb); bos = true; for (i = n_labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(labels[i], 1, 0, bos); bos = false; } } nh = mpls_select_multipath(rt, skb); if (!nh) { err = -ENETUNREACH; goto errout_free; } if (hdr_size) { skb_pull(skb, hdr_size); skb_reset_network_header(skb); } nlh = nlmsg_put(skb, portid, in_nlh->nlmsg_seq, RTM_NEWROUTE, sizeof(*r), 0); if (!nlh) { err = -EMSGSIZE; goto errout_free; } r = nlmsg_data(nlh); r->rtm_family = AF_MPLS; r->rtm_dst_len = 20; r->rtm_src_len = 0; r->rtm_table = RT_TABLE_MAIN; r->rtm_type = RTN_UNICAST; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = rt->rt_protocol; r->rtm_flags = 0; if (nla_put_labels(skb, RTA_DST, 1, &in_label)) goto nla_put_failure; if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; nlmsg_end(skb, nlh); err = rtnl_unicast(skb, net, portid); errout: mutex_unlock(&net->mpls.platform_mutex); return err; nla_put_failure: nlmsg_cancel(skb, nlh); err = -EMSGSIZE; errout_free: mutex_unlock(&net->mpls.platform_mutex); kfree_skb(skb); return err; } static int resize_platform_label_table(struct net *net, size_t limit) { size_t size = sizeof(struct mpls_route *) * limit; size_t old_limit; size_t cp_size; struct mpls_route __rcu **labels = NULL, **old; struct mpls_route *rt0 = NULL, *rt2 = NULL; unsigned index; if (size) { labels = kvzalloc(size, GFP_KERNEL); if (!labels) goto nolabels; } /* In case the predefined labels need to be populated */ if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt0)) goto nort0; rt0->rt_nh->nh_dev = lo; netdev_hold(lo, &rt0->rt_nh->nh_dev_tracker, GFP_KERNEL); rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt0->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr, lo->addr_len); } if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt2)) goto nort2; rt2->rt_nh->nh_dev = lo; netdev_hold(lo, &rt2->rt_nh->nh_dev_tracker, GFP_KERNEL); rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; rt2->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt2->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr, lo->addr_len); } mutex_lock(&net->mpls.platform_mutex); /* Remember the original table */ old = mpls_dereference(net, net->mpls.platform_label); old_limit = net->mpls.platform_labels; /* Free any labels beyond the new table */ for (index = limit; index < old_limit; index++) mpls_route_update(net, index, NULL, NULL); /* Copy over the old labels */ cp_size = size; if (old_limit < limit) cp_size = old_limit * sizeof(struct mpls_route *); memcpy(labels, old, cp_size); /* If needed set the predefined labels */ if ((old_limit <= MPLS_LABEL_IPV6NULL) && (limit > MPLS_LABEL_IPV6NULL)) { RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2); rt2 = NULL; } if ((old_limit <= MPLS_LABEL_IPV4NULL) && (limit > MPLS_LABEL_IPV4NULL)) { RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0); rt0 = NULL; } /* Update the global pointers */ net->mpls.platform_labels = limit; rcu_assign_pointer(net->mpls.platform_label, labels); mutex_unlock(&net->mpls.platform_mutex); mpls_rt_free(rt2); mpls_rt_free(rt0); if (old) { synchronize_rcu(); kvfree(old); } return 0; nort2: mpls_rt_free(rt0); nort0: kvfree(labels); nolabels: return -ENOMEM; } static int mpls_platform_labels(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; int ret; struct ctl_table tmp = { .procname = table->procname, .data = &platform_labels, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = &label_limit, }; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = resize_platform_label_table(net, platform_labels); return ret; } #define MPLS_NS_SYSCTL_OFFSET(field) \ (&((struct net *)0)->field) static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = mpls_platform_labels, }, { .procname = "ip_ttl_propagate", .data = MPLS_NS_SYSCTL_OFFSET(mpls.ip_ttl_propagate), .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "default_ttl", .data = MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl), .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, }; static __net_init int mpls_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(mpls_table); struct ctl_table *table; int i; mutex_init(&net->mpls.platform_mutex); net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; net->mpls.ip_ttl_propagate = 1; net->mpls.default_ttl = 255; table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); if (table == NULL) return -ENOMEM; /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ for (i = 0; i < table_size; i++) table[i].data = (char *)net + (uintptr_t)table[i].data; net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, table_size); if (net->mpls.ctl == NULL) { kfree(table); return -ENOMEM; } return 0; } static __net_exit void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; const struct ctl_table *table; unsigned int index; table = net->mpls.ctl->ctl_table_arg; unregister_net_sysctl_table(net->mpls.ctl); kfree(table); /* An rcu grace period has passed since there was a device in * the network namespace (and thus the last in flight packet) * left this network namespace. This is because * unregister_netdevice_many and netdev_run_todo has completed * for each network device that was in this network namespace. * * As such no additional rcu synchronization is necessary when * freeing the platform_label table. */ mutex_lock(&net->mpls.platform_mutex); platform_label = mpls_dereference(net, net->mpls.platform_label); platform_labels = net->mpls.platform_labels; for (index = 0; index < platform_labels; index++) { struct mpls_route *rt; rt = mpls_dereference(net, platform_label[index]); mpls_notify_route(net, index, rt, NULL, NULL); mpls_rt_free(rt); } mutex_unlock(&net->mpls.platform_mutex); kvfree(platform_label); } static struct pernet_operations mpls_net_ops = { .init = mpls_net_init, .exit = mpls_net_exit, }; static struct rtnl_af_ops mpls_af_ops __read_mostly = { .family = AF_MPLS, .fill_stats_af = mpls_fill_stats_af, .get_stats_af_size = mpls_get_stats_af_size, }; static const struct rtnl_msg_handler mpls_rtnl_msg_handlers[] __initdata_or_module = { {THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, RTNL_FLAG_DOIT_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, RTNL_FLAG_DOIT_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init mpls_init(void) { int err; BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4); err = register_pernet_subsys(&mpls_net_ops); if (err) goto out; err = register_netdevice_notifier(&mpls_dev_notifier); if (err) goto out_unregister_pernet; dev_add_pack(&mpls_packet_type); err = rtnl_af_register(&mpls_af_ops); if (err) goto out_unregister_dev_type; err = rtnl_register_many(mpls_rtnl_msg_handlers); if (err) goto out_unregister_rtnl_af; err = ipgre_tunnel_encap_add_mpls_ops(); if (err) { pr_err("Can't add mpls over gre tunnel ops\n"); goto out_unregister_rtnl; } err = 0; out: return err; out_unregister_rtnl: rtnl_unregister_many(mpls_rtnl_msg_handlers); out_unregister_rtnl_af: rtnl_af_unregister(&mpls_af_ops); out_unregister_dev_type: dev_remove_pack(&mpls_packet_type); out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); goto out; } module_init(mpls_init); static void __exit mpls_exit(void) { rtnl_unregister_all(PF_MPLS); rtnl_af_unregister(&mpls_af_ops); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); ipgre_tunnel_encap_del_mpls_ops(); } module_exit(mpls_exit); MODULE_DESCRIPTION("MultiProtocol Label Switching"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_NETPROTO(PF_MPLS);
138 138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/condition.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* List of "struct tomoyo_condition". */ LIST_HEAD(tomoyo_condition_list); /** * tomoyo_argv - Check argv[] in "struct linux_binbrm". * * @index: Index number of @arg_ptr. * @arg_ptr: Contents of argv[@index]. * @argc: Length of @argv. * @argv: Pointer to "struct tomoyo_argv". * @checked: Set to true if @argv[@index] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr, const int argc, const struct tomoyo_argv *argv, u8 *checked) { int i; struct tomoyo_path_info arg; arg.name = arg_ptr; for (i = 0; i < argc; argv++, checked++, i++) { bool result; if (index != argv->index) continue; *checked = 1; tomoyo_fill_path_info(&arg); result = tomoyo_path_matches_pattern(&arg, argv->value); if (argv->is_not) result = !result; if (!result) return false; } return true; } /** * tomoyo_envp - Check envp[] in "struct linux_binbrm". * * @env_name: The name of environment variable. * @env_value: The value of environment variable. * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * @checked: Set to true if @envp[@env_name] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_envp(const char *env_name, const char *env_value, const int envc, const struct tomoyo_envp *envp, u8 *checked) { int i; struct tomoyo_path_info name; struct tomoyo_path_info value; name.name = env_name; tomoyo_fill_path_info(&name); value.name = env_value; tomoyo_fill_path_info(&value); for (i = 0; i < envc; envp++, checked++, i++) { bool result; if (!tomoyo_path_matches_pattern(&name, envp->name)) continue; *checked = 1; if (envp->value) { result = tomoyo_path_matches_pattern(&value, envp->value); if (envp->is_not) result = !result; } else { result = true; if (!envp->is_not) result = !result; } if (!result) return false; } return true; } /** * tomoyo_scan_bprm - Scan "struct linux_binprm". * * @ee: Pointer to "struct tomoyo_execve". * @argc: Length of @argc. * @argv: Pointer to "struct tomoyo_argv". * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee, const u16 argc, const struct tomoyo_argv *argv, const u16 envc, const struct tomoyo_envp *envp) { struct linux_binprm *bprm = ee->bprm; struct tomoyo_page_dump *dump = &ee->dump; char *arg_ptr = ee->tmp; int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool result = true; u8 local_checked[32]; u8 *checked; if (argc + envc <= sizeof(local_checked)) { checked = local_checked; memset(local_checked, 0, sizeof(local_checked)); } else { checked = kzalloc(argc + envc, GFP_NOFS); if (!checked) return false; } while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) { result = false; goto out; } pos += PAGE_SIZE - offset; while (offset < PAGE_SIZE) { /* Read. */ const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; /* Check. */ if (argv_count) { if (!tomoyo_argv(bprm->argc - argv_count, arg_ptr, argc, argv, checked)) { result = false; break; } argv_count--; } else if (envp_count) { char *cp = strchr(arg_ptr, '='); if (cp) { *cp = '\0'; if (!tomoyo_envp(arg_ptr, cp + 1, envc, envp, checked + argc)) { result = false; break; } } envp_count--; } else { break; } arg_len = 0; } offset = 0; if (!result) break; } out: if (result) { int i; /* Check not-yet-checked entries. */ for (i = 0; i < argc; i++) { if (checked[i]) continue; /* * Return true only if all unchecked indexes in * bprm->argv[] are not matched. */ if (argv[i].is_not) continue; result = false; break; } for (i = 0; i < envc; envp++, i++) { if (checked[argc + i]) continue; /* * Return true only if all unchecked environ variables * in bprm->envp[] are either undefined or not matched. */ if ((!envp->value && !envp->is_not) || (envp->value && envp->is_not)) continue; result = false; break; } } if (checked != local_checked) kfree(checked); return result; } /** * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition". * * @file: Pointer to "struct file". * @ptr: Pointer to "struct tomoyo_name_union". * @match: True if "exec.realpath=", false if "exec.realpath!=". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_exec_realpath(struct file *file, const struct tomoyo_name_union *ptr, const bool match) { bool result; struct tomoyo_path_info exe; if (!file) return false; exe.name = tomoyo_realpath_from_path(&file->f_path); if (!exe.name) return false; tomoyo_fill_path_info(&exe); result = tomoyo_compare_name_union(&exe, ptr); kfree(exe.name); return result == match; } /** * tomoyo_get_dqword - tomoyo_get_name() for a quoted string. * * @start: String to save. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) { char *cp = start + strlen(start) - 1; if (cp == start || *start++ != '"' || *cp != '"') return NULL; *cp = '\0'; if (*start && !tomoyo_correct_word(start)) return NULL; return tomoyo_get_name(start); } /** * tomoyo_parse_name_union_quoted - Parse a quoted word. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename = param->data; if (*filename == '@') return tomoyo_parse_name_union(param, ptr); ptr->filename = tomoyo_get_dqword(filename); return ptr->filename != NULL; } /** * tomoyo_parse_argv - Parse an argv[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @argv: Pointer to "struct tomoyo_argv". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_argv(char *left, char *right, struct tomoyo_argv *argv) { if (tomoyo_parse_ulong(&argv->index, &left) != TOMOYO_VALUE_TYPE_DECIMAL || *left++ != ']' || *left) return false; argv->value = tomoyo_get_dqword(right); return argv->value != NULL; } /** * tomoyo_parse_envp - Parse an envp[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_envp(char *left, char *right, struct tomoyo_envp *envp) { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; char *cp = left + strlen(left) - 1; if (*cp-- != ']' || *cp != '"') goto out; *cp = '\0'; if (!tomoyo_correct_word(left)) goto out; name = tomoyo_get_name(left); if (!name) goto out; if (!strcmp(right, "NULL")) { value = NULL; } else { value = tomoyo_get_dqword(right); if (!value) { tomoyo_put_name(name); goto out; } } envp->name = name; envp->value = value; return true; out: return false; } /** * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry. * * @a: Pointer to "struct tomoyo_condition". * @b: Pointer to "struct tomoyo_condition". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a, const struct tomoyo_condition *b) { return a->size == b->size && a->condc == b->condc && a->numbers_count == b->numbers_count && a->names_count == b->names_count && a->argc == b->argc && a->envc == b->envc && a->grant_log == b->grant_log && a->transit == b->transit && !memcmp(a + 1, b + 1, a->size - sizeof(*a)); } /** * tomoyo_condition_type - Get condition type. * * @word: Keyword string. * * Returns one of values in "enum tomoyo_conditions_index" on success, * TOMOYO_MAX_CONDITION_KEYWORD otherwise. */ static u8 tomoyo_condition_type(const char *word) { u8 i; for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) { if (!strcmp(word, tomoyo_condition_keyword[i])) break; } return i; } /* Define this to enable debug mode. */ /* #define DEBUG_CONDITION */ #ifdef DEBUG_CONDITION #define dprintk printk #else #define dprintk(...) do { } while (0) #endif /** * tomoyo_commit_condition - Commit "struct tomoyo_condition". * * @entry: Pointer to "struct tomoyo_condition". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. * * This function merges duplicated entries. This function returns NULL if * @entry is not duplicated but memory quota for policy has exceeded. */ static struct tomoyo_condition *tomoyo_commit_condition (struct tomoyo_condition *entry) { struct tomoyo_condition *ptr; bool found = false; if (mutex_lock_interruptible(&tomoyo_policy_lock)) { dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); ptr = NULL; found = true; goto out; } list_for_each_entry(ptr, &tomoyo_condition_list, head.list) { if (!tomoyo_same_condition(ptr, entry) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; /* Same entry found. Share this entry. */ atomic_inc(&ptr->head.users); found = true; break; } if (!found) { if (tomoyo_memory_ok(entry)) { atomic_set(&entry->head.users, 1); list_add(&entry->head.list, &tomoyo_condition_list); } else { found = true; ptr = NULL; } } mutex_unlock(&tomoyo_policy_lock); out: if (found) { tomoyo_del_condition(&entry->head.list); kfree(entry); entry = ptr; } return entry; } /** * tomoyo_get_transit_preference - Parse domain transition preference for execve(). * * @param: Pointer to "struct tomoyo_acl_param". * @e: Pointer to "struct tomoyo_condition". * * Returns the condition string part. */ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param, struct tomoyo_condition *e) { char * const pos = param->data; bool flag; if (*pos == '<') { e->transit = tomoyo_get_domainname(param); goto done; } { char *cp = strchr(pos, ' '); if (cp) *cp = '\0'; flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || !strcmp(pos, "initialize") || !strcmp(pos, "reset") || !strcmp(pos, "child") || !strcmp(pos, "parent"); if (cp) *cp = ' '; } if (!flag) return pos; e->transit = tomoyo_get_name(tomoyo_read_token(param)); done: if (e->transit) return param->data; /* * Return a bad read-only condition string that will let * tomoyo_get_condition() return NULL. */ return "/"; } /** * tomoyo_get_condition - Parse condition part. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. */ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param) { struct tomoyo_condition *entry = NULL; struct tomoyo_condition_element *condp = NULL; struct tomoyo_number_union *numbers_p = NULL; struct tomoyo_name_union *names_p = NULL; struct tomoyo_argv *argv = NULL; struct tomoyo_envp *envp = NULL; struct tomoyo_condition e = { }; char * const start_of_string = tomoyo_get_transit_preference(param, &e); char * const end_of_string = start_of_string + strlen(start_of_string); char *pos; rerun: pos = start_of_string; while (1) { u8 left = -1; u8 right = -1; char *left_word = pos; char *cp; char *right_word; bool is_not; if (!*left_word) break; /* * Since left-hand condition does not allow use of "path_group" * or "number_group" and environment variable's names do not * accept '=', it is guaranteed that the original line consists * of one or more repetition of $left$operator$right blocks * where "$left is free from '=' and ' '" and "$operator is * either '=' or '!='" and "$right is free from ' '". * Therefore, we can reconstruct the original line at the end * of dry run even if we overwrite $operator with '\0'. */ cp = strchr(pos, ' '); if (cp) { *cp = '\0'; /* Will restore later. */ pos = cp + 1; } else { pos = ""; } right_word = strchr(left_word, '='); if (!right_word || right_word == left_word) goto out; is_not = *(right_word - 1) == '!'; if (is_not) *(right_word++ - 1) = '\0'; /* Will restore later. */ else if (*(right_word + 1) != '=') *right_word++ = '\0'; /* Will restore later. */ else goto out; dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word, is_not ? "!" : "", right_word); if (!strcmp(left_word, "grant_log")) { if (entry) { if (is_not || entry->grant_log != TOMOYO_GRANTLOG_AUTO) goto out; else if (!strcmp(right_word, "yes")) entry->grant_log = TOMOYO_GRANTLOG_YES; else if (!strcmp(right_word, "no")) entry->grant_log = TOMOYO_GRANTLOG_NO; else goto out; } continue; } if (!strncmp(left_word, "exec.argv[", 10)) { if (!argv) { e.argc++; e.condc++; } else { e.argc--; e.condc--; left = TOMOYO_ARGV_ENTRY; argv->is_not = is_not; if (!tomoyo_parse_argv(left_word + 10, right_word, argv++)) goto out; } goto store_value; } if (!strncmp(left_word, "exec.envp[\"", 11)) { if (!envp) { e.envc++; e.condc++; } else { e.envc--; e.condc--; left = TOMOYO_ENVP_ENTRY; envp->is_not = is_not; if (!tomoyo_parse_envp(left_word + 11, right_word, envp++)) goto out; } goto store_value; } left = tomoyo_condition_type(left_word); dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word, left); if (left == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; left = TOMOYO_NUMBER_UNION; param->data = left_word; if (*left_word == '@' || !tomoyo_parse_number_union(param, numbers_p++)) goto out; } } if (!condp) e.condc++; else e.condc--; if (left == TOMOYO_EXEC_REALPATH || left == TOMOYO_SYMLINK_TARGET) { if (!names_p) { e.names_count++; } else { e.names_count--; right = TOMOYO_NAME_UNION; param->data = right_word; if (!tomoyo_parse_name_union_quoted(param, names_p++)) goto out; } goto store_value; } right = tomoyo_condition_type(right_word); if (right == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; right = TOMOYO_NUMBER_UNION; param->data = right_word; if (!tomoyo_parse_number_union(param, numbers_p++)) goto out; } } store_value: if (!condp) { dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n", __LINE__, left, right, !is_not); continue; } condp->left = left; condp->right = right; condp->equals = !is_not; dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n", __LINE__, condp->left, condp->right, condp->equals); condp++; } dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n", __LINE__, e.condc, e.numbers_count, e.names_count, e.argc, e.envc); if (entry) { BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc | e.condc); return tomoyo_commit_condition(entry); } e.size = sizeof(*entry) + e.condc * sizeof(struct tomoyo_condition_element) + e.numbers_count * sizeof(struct tomoyo_number_union) + e.names_count * sizeof(struct tomoyo_name_union) + e.argc * sizeof(struct tomoyo_argv) + e.envc * sizeof(struct tomoyo_envp); entry = kzalloc(e.size, GFP_NOFS); if (!entry) goto out2; *entry = e; e.transit = NULL; condp = (struct tomoyo_condition_element *) (entry + 1); numbers_p = (struct tomoyo_number_union *) (condp + e.condc); names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count); argv = (struct tomoyo_argv *) (names_p + e.names_count); envp = (struct tomoyo_envp *) (argv + e.argc); { bool flag = false; for (pos = start_of_string; pos < end_of_string; pos++) { if (*pos) continue; if (flag) /* Restore " ". */ *pos = ' '; else if (*(pos + 1) == '=') /* Restore "!=". */ *pos = '!'; else /* Restore "=". */ *pos = '='; flag = !flag; } } goto rerun; out: dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); if (entry) { tomoyo_del_condition(&entry->head.list); kfree(entry); } out2: tomoyo_put_name(e.transit); return NULL; } /** * tomoyo_get_attributes - Revalidate "struct inode". * * @obj: Pointer to "struct tomoyo_obj_info". * * Returns nothing. */ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) { u8 i; struct dentry *dentry = NULL; for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct inode *inode; switch (i) { case TOMOYO_PATH1: dentry = obj->path1.dentry; if (!dentry) continue; break; case TOMOYO_PATH2: dentry = obj->path2.dentry; if (!dentry) continue; break; default: if (!dentry) continue; dentry = dget_parent(dentry); break; } inode = d_backing_inode(dentry); if (inode) { struct tomoyo_mini_stat *stat = &obj->stat[i]; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->dev = inode->i_sb->s_dev; stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */ dput(dentry); } } /** * tomoyo_condition - Check condition part. * * @r: Pointer to "struct tomoyo_request_info". * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond) { u32 i; unsigned long min_v[2] = { 0, 0 }; unsigned long max_v[2] = { 0, 0 }; const struct tomoyo_condition_element *condp; const struct tomoyo_number_union *numbers_p; const struct tomoyo_name_union *names_p; const struct tomoyo_argv *argv; const struct tomoyo_envp *envp; struct tomoyo_obj_info *obj; u16 condc; u16 argc; u16 envc; struct linux_binprm *bprm = NULL; if (!cond) return true; condc = cond->condc; argc = cond->argc; envc = cond->envc; obj = r->obj; if (r->ee) bprm = r->ee->bprm; if (!bprm && (argc || envc)) return false; condp = (struct tomoyo_condition_element *) (cond + 1); numbers_p = (const struct tomoyo_number_union *) (condp + condc); names_p = (const struct tomoyo_name_union *) (numbers_p + cond->numbers_count); argv = (const struct tomoyo_argv *) (names_p + cond->names_count); envp = (const struct tomoyo_envp *) (argv + argc); for (i = 0; i < condc; i++) { const bool match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; bool is_bitop[2] = { false, false }; u8 j; condp++; /* Check argv[] and envp[] later. */ if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) continue; /* Check string expressions. */ if (right == TOMOYO_NAME_UNION) { const struct tomoyo_name_union *ptr = names_p++; struct tomoyo_path_info *symlink; struct tomoyo_execve *ee; struct file *file; switch (left) { case TOMOYO_SYMLINK_TARGET: symlink = obj ? obj->symlink_target : NULL; if (!symlink || !tomoyo_compare_name_union(symlink, ptr) == match) goto out; break; case TOMOYO_EXEC_REALPATH: ee = r->ee; file = ee ? ee->bprm->file : NULL; if (!tomoyo_scan_exec_realpath(file, ptr, match)) goto out; break; } continue; } /* Check numeric or bit-op expressions. */ for (j = 0; j < 2; j++) { const u8 index = j ? right : left; unsigned long value = 0; switch (index) { case TOMOYO_TASK_UID: value = from_kuid(&init_user_ns, current_uid()); break; case TOMOYO_TASK_EUID: value = from_kuid(&init_user_ns, current_euid()); break; case TOMOYO_TASK_SUID: value = from_kuid(&init_user_ns, current_suid()); break; case TOMOYO_TASK_FSUID: value = from_kuid(&init_user_ns, current_fsuid()); break; case TOMOYO_TASK_GID: value = from_kgid(&init_user_ns, current_gid()); break; case TOMOYO_TASK_EGID: value = from_kgid(&init_user_ns, current_egid()); break; case TOMOYO_TASK_SGID: value = from_kgid(&init_user_ns, current_sgid()); break; case TOMOYO_TASK_FSGID: value = from_kgid(&init_user_ns, current_fsgid()); break; case TOMOYO_TASK_PID: value = tomoyo_sys_getpid(); break; case TOMOYO_TASK_PPID: value = tomoyo_sys_getppid(); break; case TOMOYO_TYPE_IS_SOCKET: value = S_IFSOCK; break; case TOMOYO_TYPE_IS_SYMLINK: value = S_IFLNK; break; case TOMOYO_TYPE_IS_FILE: value = S_IFREG; break; case TOMOYO_TYPE_IS_BLOCK_DEV: value = S_IFBLK; break; case TOMOYO_TYPE_IS_DIRECTORY: value = S_IFDIR; break; case TOMOYO_TYPE_IS_CHAR_DEV: value = S_IFCHR; break; case TOMOYO_TYPE_IS_FIFO: value = S_IFIFO; break; case TOMOYO_MODE_SETUID: value = S_ISUID; break; case TOMOYO_MODE_SETGID: value = S_ISGID; break; case TOMOYO_MODE_STICKY: value = S_ISVTX; break; case TOMOYO_MODE_OWNER_READ: value = 0400; break; case TOMOYO_MODE_OWNER_WRITE: value = 0200; break; case TOMOYO_MODE_OWNER_EXECUTE: value = 0100; break; case TOMOYO_MODE_GROUP_READ: value = 0040; break; case TOMOYO_MODE_GROUP_WRITE: value = 0020; break; case TOMOYO_MODE_GROUP_EXECUTE: value = 0010; break; case TOMOYO_MODE_OTHERS_READ: value = 0004; break; case TOMOYO_MODE_OTHERS_WRITE: value = 0002; break; case TOMOYO_MODE_OTHERS_EXECUTE: value = 0001; break; case TOMOYO_EXEC_ARGC: if (!bprm) goto out; value = bprm->argc; break; case TOMOYO_EXEC_ENVC: if (!bprm) goto out; value = bprm->envc; break; case TOMOYO_NUMBER_UNION: /* Fetch values later. */ break; default: if (!obj) goto out; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } { u8 stat_index; struct tomoyo_mini_stat *stat; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH1_GID: case TOMOYO_PATH1_INO: case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH1_MINOR: case TOMOYO_PATH1_TYPE: case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH1_PERM: stat_index = TOMOYO_PATH1; break; case TOMOYO_PATH2_UID: case TOMOYO_PATH2_GID: case TOMOYO_PATH2_INO: case TOMOYO_PATH2_MAJOR: case TOMOYO_PATH2_MINOR: case TOMOYO_PATH2_TYPE: case TOMOYO_PATH2_DEV_MAJOR: case TOMOYO_PATH2_DEV_MINOR: case TOMOYO_PATH2_PERM: stat_index = TOMOYO_PATH2; break; case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH1_PARENT_PERM: stat_index = TOMOYO_PATH1_PARENT; break; case TOMOYO_PATH2_PARENT_UID: case TOMOYO_PATH2_PARENT_GID: case TOMOYO_PATH2_PARENT_INO: case TOMOYO_PATH2_PARENT_PERM: stat_index = TOMOYO_PATH2_PARENT; break; default: goto out; } if (!obj->stat_valid[stat_index]) goto out; stat = &obj->stat[stat_index]; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH2_UID: case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH2_PARENT_UID: value = from_kuid(&init_user_ns, stat->uid); break; case TOMOYO_PATH1_GID: case TOMOYO_PATH2_GID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH2_PARENT_GID: value = from_kgid(&init_user_ns, stat->gid); break; case TOMOYO_PATH1_INO: case TOMOYO_PATH2_INO: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH2_PARENT_INO: value = stat->ino; break; case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH2_MAJOR: value = MAJOR(stat->dev); break; case TOMOYO_PATH1_MINOR: case TOMOYO_PATH2_MINOR: value = MINOR(stat->dev); break; case TOMOYO_PATH1_TYPE: case TOMOYO_PATH2_TYPE: value = stat->mode & S_IFMT; break; case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH2_DEV_MAJOR: value = MAJOR(stat->rdev); break; case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH2_DEV_MINOR: value = MINOR(stat->rdev); break; case TOMOYO_PATH1_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PARENT_PERM: value = stat->mode & S_IALLUGO; break; } } break; } max_v[j] = value; min_v[j] = value; switch (index) { case TOMOYO_MODE_SETUID: case TOMOYO_MODE_SETGID: case TOMOYO_MODE_STICKY: case TOMOYO_MODE_OWNER_READ: case TOMOYO_MODE_OWNER_WRITE: case TOMOYO_MODE_OWNER_EXECUTE: case TOMOYO_MODE_GROUP_READ: case TOMOYO_MODE_GROUP_WRITE: case TOMOYO_MODE_GROUP_EXECUTE: case TOMOYO_MODE_OTHERS_READ: case TOMOYO_MODE_OTHERS_WRITE: case TOMOYO_MODE_OTHERS_EXECUTE: is_bitop[j] = true; } } if (left == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; min_v[0] = ptr->values[0]; max_v[0] = ptr->values[1]; } if (right == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; if (ptr->group) { if (tomoyo_number_matches_group(min_v[0], max_v[0], ptr->group) == match) continue; } else { if ((min_v[0] <= ptr->values[1] && max_v[0] >= ptr->values[0]) == match) continue; } goto out; } /* * Bit operation is valid only when counterpart value * represents permission. */ if (is_bitop[0] && is_bitop[1]) { goto out; } else if (is_bitop[0]) { switch (right) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } else if (is_bitop[1]) { switch (left) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } /* Normal value range comparison. */ if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match) continue; out: return false; } /* Check argv[] and envp[] now. */ if (r->ee && (argc || envc)) return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp); return true; }
19 732 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); neigh_confirm(n); rcu_read_unlock(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
29 29 29 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 // SPDX-License-Identifier: GPL-2.0 /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ #include <linux/sysctl.h> #include <linux/seqlock.h> #include <linux/init.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/tcp.h> #include <net/udp.h> #include <net/cipso_ipv4.h> #include <net/ping.h> #include <net/protocol.h> #include <net/netevent.h> static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; static int ip_ttl_max = 255; static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT; static unsigned long ip_ping_group_range_min[] = { 0, 0 }; static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; static int tcp_ecn_mode_max = 2; static u32 icmp_errors_extension_mask_all = GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, unsigned int low, unsigned int high) { bool same_parity = !((low ^ high) & 1); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low); } /* Validate changes from /proc interface. */ static int ipv4_local_port_range(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int ret; int range[2]; struct ctl_table tmp = { .data = &range, .maxlen = sizeof(range), .mode = table->mode, .extra1 = &ip_local_port_range_min, .extra2 = &ip_local_port_range_max, }; inet_get_local_port_range(net, &range[0], &range[1]); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { /* Ensure that the upper limit is not smaller than the lower, * and that the lower does not encroach upon the privileged * port limit. */ if ((range[1] < range[0]) || (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock))) ret = -EINVAL; else set_local_port_range(net, range[0], range[1]); } return ret; } /* Validate changes from /proc interface. */ static int ipv4_privileged_ports(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); int ret; int pports; int range[2]; struct ctl_table tmp = { .data = &pports, .maxlen = sizeof(pports), .mode = table->mode, .extra1 = &ip_privileged_port_min, .extra2 = &ip_privileged_port_max, }; pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { inet_get_local_port_range(net, &range[0], &range[1]); /* Ensure that the local port range doesn't overlap with the * privileged port range. */ if (range[0] < pports) ret = -EINVAL; else WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports); } return ret; } static void inet_get_ping_group_range_table(const struct ctl_table *table, kgid_t *low, kgid_t *high) { kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } /* Update system visible IP port range */ static void set_ping_group_range(const struct ctl_table *table, kgid_t low, kgid_t high) { kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); write_seqlock(&net->ipv4.ping_group_range.lock); data[0] = low; data[1] = high; write_sequnlock(&net->ipv4.ping_group_range.lock); } /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; unsigned long urange[2]; kgid_t low, high; struct ctl_table tmp = { .data = &urange, .maxlen = sizeof(urange), .mode = table->mode, .extra1 = &ip_ping_group_range_min, .extra2 = &ip_ping_group_range_max, }; inet_get_ping_group_range_table(table, &low, &high); urange[0] = from_kgid_munged(user_ns, low); urange[1] = from_kgid_munged(user_ns, high); ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); if (!gid_valid(low) || !gid_valid(high)) return -EINVAL; if (urange[1] < urange[0] || gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } set_ping_group_range(table, low, high); } return ret; } static int ipv4_fwd_update_priority(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); return ret; } static int proc_tcp_congestion_control(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, .maxlen = TCP_CA_NAME_MAX, }; int ret; tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_default_congestion_control(net, val); return ret; } static int proc_tcp_available_congestion_control(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_allowed_congestion_control(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); return ret; } static int sscanf_key(char *buf, __le32 *key) { u32 user_key[4]; int i, ret = 0; if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1, user_key + 2, user_key + 3) != 4) { ret = -EINVAL; } else { for (i = 0; i < ARRAY_SIZE(user_key); i++) key[i] = cpu_to_le32(user_key[i]); } pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", user_key[0], user_key[1], user_key[2], user_key[3], buf, ret); return ret; } static int proc_tcp_fastopen_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); /* maxlen to print the list of keys in hex (*2), with dashes * separating doublewords and a comma in between keys. */ struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)) }; u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)]; __le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)]; char *backup_data; int ret, i = 0, off = 0, n_keys; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key); if (!n_keys) { memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); n_keys = 1; } for (i = 0; i < n_keys * 4; i++) user_key[i] = le32_to_cpu(key[i]); for (i = 0; i < n_keys; i++) { off += snprintf(tbl.data + off, tbl.maxlen - off, "%08x-%08x-%08x-%08x", user_key[i * 4], user_key[i * 4 + 1], user_key[i * 4 + 2], user_key[i * 4 + 3]); if (WARN_ON_ONCE(off >= tbl.maxlen - 1)) break; if (i + 1 < n_keys) off += snprintf(tbl.data + off, tbl.maxlen - off, ","); } ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { backup_data = strchr(tbl.data, ','); if (backup_data) { *backup_data = '\0'; backup_data++; } if (sscanf_key(tbl.data, key)) { ret = -EINVAL; goto bad_key; } if (backup_data) { if (sscanf_key(backup_data, key + 4)) { ret = -EINVAL; goto bad_key; } } tcp_fastopen_reset_cipher(net, NULL, key, backup_data ? key + 4 : NULL); } bad_key: kfree(tbl.data); return ret; } static int proc_tfo_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } static int proc_tcp_available_ulp(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_tcp_ehash_entries(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_child_ehash_entries); struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; int tcp_ehash_entries; struct ctl_table tbl; tcp_ehash_entries = hinfo->ehash_mask + 1; /* A negative number indicates that the child netns * shares the global ehash. */ if (!net_eq(net, &init_net) && !hinfo->pernet) tcp_ehash_entries *= -1; memset(&tbl, 0, sizeof(tbl)); tbl.data = &tcp_ehash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } static int proc_udp_hash_entries(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_udp_child_hash_entries); int udp_hash_entries; struct ctl_table tbl; udp_hash_entries = net->ipv4.udp_table->mask + 1; /* A negative number indicates that the child netns * shares the global udp_table. */ if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table) udp_hash_entries *= -1; memset(&tbl, 0, sizeof(tbl)); tbl.data = &udp_hash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_fib_multipath_hash_policy); int ret; ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } static int proc_fib_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv4.sysctl_fib_multipath_hash_fields); ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } static u32 proc_fib_multipath_hash_rand_seed __ro_after_init; static void proc_fib_multipath_hash_init_rand_seed(void) { get_random_bytes(&proc_fib_multipath_hash_rand_seed, sizeof(proc_fib_multipath_hash_rand_seed)); } static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) { struct sysctl_fib_multipath_hash_seed new = { .user_seed = user_seed, .mp_seed = (user_seed ? user_seed : proc_fib_multipath_hash_rand_seed), }; WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new); } static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sysctl_fib_multipath_hash_seed *mphs; struct net *net = table->data; struct ctl_table tmp; u32 user_seed; int ret; mphs = &net->ipv4.sysctl_fib_multipath_hash_seed; user_seed = mphs->user_seed; tmp = *table; tmp.data = &user_seed; ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { proc_fib_multipath_hash_set_seed(net, user_seed); call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); } return ret; } #else static void proc_fib_multipath_hash_init_rand_seed(void) { } static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) { } #endif static struct ctl_table ipv4_table[] = { { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "inet_peer_minttl", .data = &inet_peer_minttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "inet_peer_maxttl", .data = &inet_peer_maxttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), .data = &sysctl_tcp_mem, .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", .data = &cipso_v4_cache_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_cache_bucket_size", .data = &cipso_v4_cache_bucketsize, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_rbm_optfmt", .data = &cipso_v4_rbm_optfmt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_rbm_strictvalid", .data = &cipso_v4_rbm_strictvalid, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, .mode = 0444, .proc_handler = proc_tcp_available_ulp, }, { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "fib_sync_mem", .data = &sysctl_fib_sync_mem, .maxlen = sizeof(sysctl_fib_sync_mem), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, }; static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_max_tw_buckets", .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_enable_probe", .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_errors_extension_mask", .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &icmp_errors_extension_mask_all, }, { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "icmp_ratemask", .data = &init_net.ipv4.sysctl_icmp_ratemask, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "icmp_msgs_per_sec", .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "icmp_msgs_burst", .data = &init_net.ipv4.sysctl_icmp_msgs_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "ping_group_range", .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, .mode = 0644, .proc_handler = ipv4_ping_group_range, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "raw_l3mdev_accept", .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, { .procname = "tcp_ecn_option", .data = &init_net.ipv4.sysctl_tcp_ecn_option, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_ecn_option_beacon", .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_dynaddr", .data = &init_net.ipv4.sysctl_ip_dynaddr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_early_demux", .data = &init_net.ipv4.sysctl_ip_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "nexthop_compat_mode", .data = &init_net.ipv4.sysctl_nexthop_compat_mode, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, { .procname = "ip_local_port_range", .maxlen = 0, .data = &init_net, .mode = 0644, .proc_handler = ipv4_local_port_range, }, { .procname = "ip_local_reserved_ports", .data = &init_net.ipv4.sysctl_local_reserved_ports, .maxlen = 65536, .mode = 0644, .proc_handler = proc_do_large_bitmap, }, { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_use_pmtu", .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_autobind_reuse", .data = &init_net.ipv4.sysctl_ip_autobind_reuse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fwmark_accept", .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "tcp_l3mdev_accept", .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_mtu_probing", .data = &init_net.ipv4.sysctl_tcp_mtu_probing, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_base_mss", .data = &init_net.ipv4.sysctl_tcp_base_mss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_min_snd_mss", .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_min_snd_mss_min, .extra2 = &tcp_min_snd_mss_max, }, { .procname = "tcp_mtu_probe_floor", .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_min_snd_mss_min, .extra2 = &tcp_min_snd_mss_max, }, { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", .data = &init_net.ipv4.sysctl_igmp_llm_reports, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "igmp_max_memberships", .data = &init_net.ipv4.sysctl_igmp_max_memberships, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "igmp_max_msf", .data = &init_net.ipv4.sysctl_igmp_max_msf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #ifdef CONFIG_IP_MULTICAST { .procname = "igmp_qrv", .data = &init_net.ipv4.sysctl_igmp_qrv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, #endif { .procname = "tcp_congestion_control", .data = &init_net.ipv4.tcp_congestion_control, .mode = 0644, .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, { .procname = "tcp_available_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0444, .proc_handler = proc_tcp_available_congestion_control, }, { .procname = "tcp_allowed_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_keepalive_probes", .data = &init_net.ipv4.sysctl_tcp_keepalive_probes, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_keepalive_intvl", .data = &init_net.ipv4.sysctl_tcp_keepalive_intvl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_syn_retries", .data = &init_net.ipv4.sysctl_tcp_syn_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = &tcp_syn_retries_min, .extra2 = &tcp_syn_retries_max }, { .procname = "tcp_synack_retries", .data = &init_net.ipv4.sysctl_tcp_synack_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_SYN_COOKIES { .procname = "tcp_syncookies", .data = &init_net.ipv4.sysctl_tcp_syncookies, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #endif { .procname = "tcp_migrate_req", .data = &init_net.ipv4.sysctl_tcp_migrate_req, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "tcp_reordering", .data = &init_net.ipv4.sysctl_tcp_reordering, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_retries1", .data = &init_net.ipv4.sysctl_tcp_retries1, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_retr1_max }, { .procname = "tcp_retries2", .data = &init_net.ipv4.sysctl_tcp_retries2, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fin_timeout", .data = &init_net.ipv4.sysctl_tcp_fin_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_notsent_lowat", .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec, }, { .procname = "tcp_tw_reuse", .data = &init_net.ipv4.sysctl_tcp_tw_reuse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_tw_reuse_delay", .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &tcp_tw_reuse_delay_max, }, { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_fastopen", .data = &init_net.ipv4.sysctl_tcp_fastopen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_fastopen_key", .mode = 0600, .data = &init_net.ipv4.sysctl_tcp_fastopen, /* maxlen to print the list of keys in hex (*2), with dashes * separating doublewords and a comma in between keys. */ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)), .proc_handler = proc_tcp_fastopen_key, }, { .procname = "tcp_fastopen_blackhole_timeout_sec", .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_tfo_blackhole_detect_timeout, .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "fib_multipath_hash_fields", .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_fib_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &fib_multipath_hash_fields_all_mask, }, { .procname = "fib_multipath_hash_seed", .data = &init_net, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_fib_multipath_hash_seed, }, #endif { .procname = "ip_unprivileged_port_start", .maxlen = sizeof(int), .data = &init_net.ipv4.sysctl_ip_prot_sock, .mode = 0644, .proc_handler = ipv4_privileged_ports, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "udp_l3mdev_accept", .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_sack", .data = &init_net.ipv4.sysctl_tcp_sack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_window_scaling", .data = &init_net.ipv4.sysctl_tcp_window_scaling, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_timestamps", .data = &init_net.ipv4.sysctl_tcp_timestamps, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_retrans", .data = &init_net.ipv4.sysctl_tcp_early_retrans, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, }, { .procname = "tcp_recovery", .data = &init_net.ipv4.sysctl_tcp_recovery, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_thin_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_slow_start_after_idle", .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_retrans_collapse", .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_stdurg", .data = &init_net.ipv4.sysctl_tcp_stdurg, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rfc1337", .data = &init_net.ipv4.sysctl_tcp_rfc1337, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_abort_on_overflow", .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fack", .data = &init_net.ipv4.sysctl_tcp_fack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_max_reordering", .data = &init_net.ipv4.sysctl_tcp_max_reordering, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_dsack", .data = &init_net.ipv4.sysctl_tcp_dsack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_app_win", .data = &init_net.ipv4.sysctl_tcp_app_win, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_app_win_max, }, { .procname = "tcp_adv_win_scale", .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_adv_win_scale_min, .extra2 = &tcp_adv_win_scale_max, }, { .procname = "tcp_frto", .data = &init_net.ipv4.sysctl_tcp_frto, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_metrics_save", .data = &init_net.ipv4.sysctl_tcp_nometrics_save, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_ssthresh_metrics_save", .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rcvbuf_low_rtt", .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_workaround_signed_windows", .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_limit_output_bytes", .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_challenge_ack_limit", .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_min_tso_segs", .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_tso_rtt_log", .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_min_rtt_wlen", .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &one_day_secs }, { .procname = "tcp_autocorking", .data = &init_net.ipv4.sysctl_tcp_autocorking, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_invalid_ratelimit", .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "tcp_pacing_ss_ratio", .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_pacing_ca_ratio", .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_wmem", .data = &init_net.ipv4.sysctl_tcp_wmem, .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rmem", .data = &init_net.ipv4.sysctl_tcp_rmem, .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_comp_sack_delay_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_comp_sack_rtt_percent", .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_comp_sack_slack_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "tcp_backlog_ack_defer", .data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_reflect_tos", .data = &init_net.ipv4.sysctl_tcp_reflect_tos, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_ehash_entries", .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, .mode = 0444, .proc_handler = proc_tcp_ehash_entries, }, { .procname = "tcp_child_ehash_entries", .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_child_ehash_entries_max, }, { .procname = "udp_hash_entries", .data = &init_net.ipv4.sysctl_udp_child_hash_entries, .mode = 0444, .proc_handler = proc_udp_hash_entries, }, { .procname = "udp_child_hash_entries", .data = &init_net.ipv4.sysctl_udp_child_hash_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &udp_child_hash_entries_max, }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, { .procname = "udp_wmem_min", .data = &init_net.ipv4.sysctl_udp_wmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_plb_enabled", .data = &init_net.ipv4.sysctl_tcp_plb_enabled, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_plb_idle_rehash_rounds", .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_plb_max_rounds, }, { .procname = "tcp_plb_rehash_rounds", .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_plb_max_rounds, }, { .procname = "tcp_plb_suspend_rto_sec", .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_plb_cong_thresh", .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_plb_max_cong_thresh, }, { .procname = "tcp_syn_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_syn_linear_timeouts_max, }, { .procname = "tcp_shrink_window", .data = &init_net.ipv4.sysctl_tcp_shrink_window, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_pingpong_thresh", .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rto_min_us", .data = &init_net.ipv4.sysctl_tcp_rto_min_us, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rto_max_ms", .data = &init_net.ipv4.sysctl_tcp_rto_max_ms, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE_THOUSAND, .extra2 = &tcp_rto_max_max, }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) { size_t table_size = ARRAY_SIZE(ipv4_net_table); struct ctl_table *table; table = ipv4_net_table; if (!net_eq(net, &init_net)) { int i; table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (!table) goto err_alloc; for (i = 0; i < table_size; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net */ table[i].data += (void *)net - (void *)&init_net; } else { /* Entries without data pointer are global; * Make them read-only in non-init_net ns */ table[i].mode &= ~0222; } } } net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table, table_size); if (!net->ipv4.ipv4_hdr) goto err_reg; net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; proc_fib_multipath_hash_set_seed(net, 0); return 0; err_ports: unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static __net_exit void ipv4_sysctl_exit_net(struct net *net) { const struct ctl_table *table; kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); } static __net_initdata struct pernet_operations ipv4_sysctl_ops = { .init = ipv4_sysctl_init_net, .exit = ipv4_sysctl_exit_net, }; static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (!hdr) return -ENOMEM; proc_fib_multipath_hash_init_rand_seed(); if (register_pernet_subsys(&ipv4_sysctl_ops)) { unregister_net_sysctl_table(hdr); return -ENOMEM; } return 0; } __initcall(sysctl_ipv4_init);
29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/slab.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, }; static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static bool forward __read_mostly = true; module_param(forward, bool, 0000); static int iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; int err; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } static int __net_init iptable_filter_net_init(struct net *net) { if (!forward) return iptable_filter_table_init(net); return 0; } static void __net_exit iptable_filter_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) { ipt_unregister_table_exit(net, "filter"); } static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .pre_exit = iptable_filter_net_pre_exit, .exit = iptable_filter_net_exit, }; static int __init iptable_filter_init(void) { int ret = xt_register_template(&packet_filter, iptable_filter_table_init); if (ret < 0) return ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); if (IS_ERR(filter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); } ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) { xt_unregister_template(&packet_filter); kfree(filter_ops); return ret; } return 0; } static void __exit iptable_filter_fini(void) { unregister_pernet_subsys(&iptable_filter_net_ops); xt_unregister_template(&packet_filter); kfree(filter_ops); } module_init(iptable_filter_init); module_exit(iptable_filter_fini);
9 4 5 5 26 26 1 26 6 4 10 10 23 1 1 7 2 1 11 7 2 5 1 3 30 30 28 19 2 2 7 3 24 21 3 21 11 372 375 186 194 1 1 2 1 2 1 2 2 2 2 1 1 2 1 1 2 2 2 1 5 1 4 1 3 3 2 1 4 1 7 2 1 2 5 1 4 2 1 2 2 161 4 5 2 2 2 3 2 3 1 2 2 3 2 2 3 1 1 3 3 2 2 2 3 2 4 3 5 2 3 4 3 2 2 1 2 2 1 3 5 2 6 3 1 4 1 5 2 2 3 6 3 1 21 10 2 1 20 2 2 7 23 2 4 54 3 478 3 101 364 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <net/psp.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = psp_sk_overhead(sk) + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt))); sk_dst_reset(sk); return opt; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int retv = -ENOPROTOOPT; int val, valbool; if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) return -EINVAL; if (val != valbool) return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) return retv; if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; case IPV6_MTU: if (optlen < sizeof(int)) return -EINVAL; if (val && val < IPV6_MIN_MTU) return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. */ WRITE_ONCE(np->min_hopcount, val); return 0; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 1) return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; case IPV6_AUTOFLOWLABEL: inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; case IPV6_RECVERR: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RECVERR6, sk, valbool); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) return -EINVAL; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(SNDFLOW, sk, valbool); return 0; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (val) { struct net_device *dev; int bound_dev_if, midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); return -ENODEV; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != val && (!midx || midx != bound_dev_if)) return -EINVAL; } WRITE_ONCE(np->mcast_oif, val); return 0; case IPV6_UNICAST_IF: { struct net_device *dev; int ifindex; if (optlen != sizeof(int)) return -EINVAL; ifindex = (__force int)ntohl((__force __be32)val); if (!ifindex) { WRITE_ONCE(np->ucast_oif, 0); return 0; } dev = dev_get_by_index(net, ifindex); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); if (READ_ONCE(sk->sk_bound_dev_if)) return -EINVAL; WRITE_ONCE(np->ucast_oif, ifindex); return 0; } } sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. */ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); if (retv == 0) inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: val = np->rxopt.bits.hopopts; break; case IPV6_2292HOPOPTS: val = np->rxopt.bits.ohopopts; break; case IPV6_RECVDSTOPTS: val = np->rxopt.bits.dstopts; break; case IPV6_2292DSTOPTS: val = np->rxopt.bits.odstopts; break; case IPV6_TCLASS: val = np->tclass; break; case IPV6_RECVTCLASS: val = np->rxopt.bits.rxtclass; break; case IPV6_FLOWINFO: val = np->rxopt.bits.rxflow; break; case IPV6_RECVPATHMTU: val = np->rxopt.bits.rxpmtu; break; case IPV6_PATHMTU: { struct dst_entry *dst; struct ip6_mtuinfo mtuinfo; if (len < sizeof(mtuinfo)) return -EINVAL; len = sizeof(mtuinfo); memset(&mtuinfo, 0, sizeof(mtuinfo)); rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) mtuinfo.ip6m_mtu = dst_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; } case IPV6_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) val = READ_ONCE(np->hop_limit); else val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = ip6_dst_hoplimit(dst); rcu_read_unlock(); } if (val < 0) val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit); break; } case IPV6_MULTICAST_LOOP: val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; int flags; if (len < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) return -EINVAL; len = sizeof(freq); flags = freq.flr_flags; memset(&freq, 0, sizeof(freq)); val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; } case IPV6_ADDR_PREFERENCES: { u8 srcprefs = READ_ONCE(np->srcprefs); val = 0; if (srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; else if (srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } if (srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; } case IPV6_MINHOPCOUNT: val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; case IPV6_ROUTER_ALERT: val = inet6_test_bit(RTALERT, sk); break; case IPV6_ROUTER_ALERT_ISOLATE: val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } #endif return err; } EXPORT_SYMBOL(ipv6_getsockopt);
6 6 6 6 6 6 6 6 2 24 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/cfm_bridge.h> #include <uapi/linux/cfm_bridge.h> #include "br_private_cfm.h" static struct br_cfm_mep *br_mep_find(struct net_bridge *br, u32 instance) { struct br_cfm_mep *mep; hlist_for_each_entry(mep, &br->mep_list, head) if (mep->instance == instance) return mep; return NULL; } static struct br_cfm_mep *br_mep_find_ifindex(struct net_bridge *br, u32 ifindex) { struct br_cfm_mep *mep; hlist_for_each_entry_rcu(mep, &br->mep_list, head, lockdep_rtnl_is_held()) if (mep->create.ifindex == ifindex) return mep; return NULL; } static struct br_cfm_peer_mep *br_peer_mep_find(struct br_cfm_mep *mep, u32 mepid) { struct br_cfm_peer_mep *peer_mep; hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head, lockdep_rtnl_is_held()) if (peer_mep->mepid == mepid) return peer_mep; return NULL; } static struct net_bridge_port *br_mep_get_port(struct net_bridge *br, u32 ifindex) { struct net_bridge_port *port; list_for_each_entry(port, &br->port_list, list) if (port->dev->ifindex == ifindex) return port; return NULL; } /* Calculate the CCM interval in us. */ static u32 interval_to_us(enum br_cfm_ccm_interval interval) { switch (interval) { case BR_CFM_CCM_INTERVAL_NONE: return 0; case BR_CFM_CCM_INTERVAL_3_3_MS: return 3300; case BR_CFM_CCM_INTERVAL_10_MS: return 10 * 1000; case BR_CFM_CCM_INTERVAL_100_MS: return 100 * 1000; case BR_CFM_CCM_INTERVAL_1_SEC: return 1000 * 1000; case BR_CFM_CCM_INTERVAL_10_SEC: return 10 * 1000 * 1000; case BR_CFM_CCM_INTERVAL_1_MIN: return 60 * 1000 * 1000; case BR_CFM_CCM_INTERVAL_10_MIN: return 10 * 60 * 1000 * 1000; } return 0; } /* Convert the interface interval to CCM PDU value. */ static u32 interval_to_pdu(enum br_cfm_ccm_interval interval) { switch (interval) { case BR_CFM_CCM_INTERVAL_NONE: return 0; case BR_CFM_CCM_INTERVAL_3_3_MS: return 1; case BR_CFM_CCM_INTERVAL_10_MS: return 2; case BR_CFM_CCM_INTERVAL_100_MS: return 3; case BR_CFM_CCM_INTERVAL_1_SEC: return 4; case BR_CFM_CCM_INTERVAL_10_SEC: return 5; case BR_CFM_CCM_INTERVAL_1_MIN: return 6; case BR_CFM_CCM_INTERVAL_10_MIN: return 7; } return 0; } /* Convert the CCM PDU value to interval on interface. */ static u32 pdu_to_interval(u32 value) { switch (value) { case 0: return BR_CFM_CCM_INTERVAL_NONE; case 1: return BR_CFM_CCM_INTERVAL_3_3_MS; case 2: return BR_CFM_CCM_INTERVAL_10_MS; case 3: return BR_CFM_CCM_INTERVAL_100_MS; case 4: return BR_CFM_CCM_INTERVAL_1_SEC; case 5: return BR_CFM_CCM_INTERVAL_10_SEC; case 6: return BR_CFM_CCM_INTERVAL_1_MIN; case 7: return BR_CFM_CCM_INTERVAL_10_MIN; } return BR_CFM_CCM_INTERVAL_NONE; } static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep) { u32 interval_us; interval_us = interval_to_us(peer_mep->mep->cc_config.exp_interval); /* Function ccm_rx_dwork must be called with 1/4 * of the configured CC 'expected_interval' * in order to detect CCM defect after 3.25 interval. */ queue_delayed_work(system_percpu_wq, &peer_mep->ccm_rx_dwork, usecs_to_jiffies(interval_us / 4)); } static void br_cfm_notify(int event, const struct net_bridge_port *port) { u32 filter = RTEXT_FILTER_CFM_STATUS; br_info_notify(event, port->br, NULL, filter); } static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep) { memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status)); peer_mep->ccm_rx_count_miss = 0; ccm_rx_timer_start(peer_mep); } static void cc_peer_disable(struct br_cfm_peer_mep *peer_mep) { cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); } static struct sk_buff *ccm_frame_build(struct br_cfm_mep *mep, const struct br_cfm_cc_ccm_tx_info *const tx_info) { struct br_cfm_common_hdr *common_hdr; struct net_bridge_port *b_port; struct br_cfm_maid *maid; u8 *itu_reserved, *e_tlv; struct ethhdr *eth_hdr; struct sk_buff *skb; __be32 *status_tlv; __be32 *snumber; __be16 *mepid; skb = dev_alloc_skb(CFM_CCM_MAX_FRAME_LENGTH); if (!skb) return NULL; rcu_read_lock(); b_port = rcu_dereference(mep->b_port); if (!b_port) { kfree_skb(skb); rcu_read_unlock(); return NULL; } skb->dev = b_port->dev; rcu_read_unlock(); /* The device cannot be deleted until the work_queue functions has * completed. This function is called from ccm_tx_work_expired() * that is a work_queue functions. */ skb->protocol = htons(ETH_P_CFM); skb->priority = CFM_FRAME_PRIO; /* Ethernet header */ eth_hdr = skb_put(skb, sizeof(*eth_hdr)); ether_addr_copy(eth_hdr->h_dest, tx_info->dmac.addr); ether_addr_copy(eth_hdr->h_source, mep->config.unicast_mac.addr); eth_hdr->h_proto = htons(ETH_P_CFM); /* Common CFM Header */ common_hdr = skb_put(skb, sizeof(*common_hdr)); common_hdr->mdlevel_version = mep->config.mdlevel << 5; common_hdr->opcode = BR_CFM_OPCODE_CCM; common_hdr->flags = (mep->rdi << 7) | interval_to_pdu(mep->cc_config.exp_interval); common_hdr->tlv_offset = CFM_CCM_TLV_OFFSET; /* Sequence number */ snumber = skb_put(skb, sizeof(*snumber)); if (tx_info->seq_no_update) { *snumber = cpu_to_be32(mep->ccm_tx_snumber); mep->ccm_tx_snumber += 1; } else { *snumber = 0; } mepid = skb_put(skb, sizeof(*mepid)); *mepid = cpu_to_be16((u16)mep->config.mepid); maid = skb_put(skb, sizeof(*maid)); memcpy(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data)); /* ITU reserved (CFM_CCM_ITU_RESERVED_SIZE octets) */ itu_reserved = skb_put(skb, CFM_CCM_ITU_RESERVED_SIZE); memset(itu_reserved, 0, CFM_CCM_ITU_RESERVED_SIZE); /* Generel CFM TLV format: * TLV type: one byte * TLV value length: two bytes * TLV value: 'TLV value length' bytes */ /* Port status TLV. The value length is 1. Total of 4 bytes. */ if (tx_info->port_tlv) { status_tlv = skb_put(skb, sizeof(*status_tlv)); *status_tlv = cpu_to_be32((CFM_PORT_STATUS_TLV_TYPE << 24) | (1 << 8) | /* Value length */ (tx_info->port_tlv_value & 0xFF)); } /* Interface status TLV. The value length is 1. Total of 4 bytes. */ if (tx_info->if_tlv) { status_tlv = skb_put(skb, sizeof(*status_tlv)); *status_tlv = cpu_to_be32((CFM_IF_STATUS_TLV_TYPE << 24) | (1 << 8) | /* Value length */ (tx_info->if_tlv_value & 0xFF)); } /* End TLV */ e_tlv = skb_put(skb, sizeof(*e_tlv)); *e_tlv = CFM_ENDE_TLV_TYPE; return skb; } static void ccm_frame_tx(struct sk_buff *skb) { skb_reset_network_header(skb); dev_queue_xmit(skb); } /* This function is called with the configured CC 'expected_interval' * in order to drive CCM transmission when enabled. */ static void ccm_tx_work_expired(struct work_struct *work) { struct delayed_work *del_work; struct br_cfm_mep *mep; struct sk_buff *skb; u32 interval_us; del_work = to_delayed_work(work); mep = container_of(del_work, struct br_cfm_mep, ccm_tx_dwork); if (time_before_eq(mep->ccm_tx_end, jiffies)) { /* Transmission period has ended */ mep->cc_ccm_tx_info.period = 0; return; } skb = ccm_frame_build(mep, &mep->cc_ccm_tx_info); if (skb) ccm_frame_tx(skb); interval_us = interval_to_us(mep->cc_config.exp_interval); queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, usecs_to_jiffies(interval_us)); } /* This function is called with 1/4 of the configured CC 'expected_interval' * in order to detect CCM defect after 3.25 interval. */ static void ccm_rx_work_expired(struct work_struct *work) { struct br_cfm_peer_mep *peer_mep; struct net_bridge_port *b_port; struct delayed_work *del_work; del_work = to_delayed_work(work); peer_mep = container_of(del_work, struct br_cfm_peer_mep, ccm_rx_dwork); /* After 13 counts (4 * 3,25) then 3.25 intervals are expired */ if (peer_mep->ccm_rx_count_miss < 13) { /* 3.25 intervals are NOT expired without CCM reception */ peer_mep->ccm_rx_count_miss++; /* Start timer again */ ccm_rx_timer_start(peer_mep); } else { /* 3.25 intervals are expired without CCM reception. * CCM defect detected */ peer_mep->cc_status.ccm_defect = true; /* Change in CCM defect status - notify */ rcu_read_lock(); b_port = rcu_dereference(peer_mep->mep->b_port); if (b_port) br_cfm_notify(RTM_NEWLINK, b_port); rcu_read_unlock(); } } static u32 ccm_tlv_extract(struct sk_buff *skb, u32 index, struct br_cfm_peer_mep *peer_mep) { __be32 *s_tlv; __be32 _s_tlv; u32 h_s_tlv; u8 *e_tlv; u8 _e_tlv; e_tlv = skb_header_pointer(skb, index, sizeof(_e_tlv), &_e_tlv); if (!e_tlv) return 0; /* TLV is present - get the status TLV */ s_tlv = skb_header_pointer(skb, index, sizeof(_s_tlv), &_s_tlv); if (!s_tlv) return 0; h_s_tlv = ntohl(*s_tlv); if ((h_s_tlv >> 24) == CFM_IF_STATUS_TLV_TYPE) { /* Interface status TLV */ peer_mep->cc_status.tlv_seen = true; peer_mep->cc_status.if_tlv_value = (h_s_tlv & 0xFF); } if ((h_s_tlv >> 24) == CFM_PORT_STATUS_TLV_TYPE) { /* Port status TLV */ peer_mep->cc_status.tlv_seen = true; peer_mep->cc_status.port_tlv_value = (h_s_tlv & 0xFF); } /* The Sender ID TLV is not handled */ /* The Organization-Specific TLV is not handled */ /* Return the length of this tlv. * This is the length of the value field plus 3 bytes for size of type * field and length field */ return ((h_s_tlv >> 8) & 0xFFFF) + 3; } /* note: already called with rcu_read_lock */ static int br_cfm_frame_rx(struct net_bridge_port *port, struct sk_buff *skb) { u32 mdlevel, interval, size, index, max; const struct br_cfm_common_hdr *hdr; struct br_cfm_peer_mep *peer_mep; const struct br_cfm_maid *maid; struct br_cfm_common_hdr _hdr; struct br_cfm_maid _maid; struct br_cfm_mep *mep; struct net_bridge *br; __be32 *snumber; __be32 _snumber; __be16 *mepid; __be16 _mepid; if (port->state == BR_STATE_DISABLED) return 0; hdr = skb_header_pointer(skb, 0, sizeof(_hdr), &_hdr); if (!hdr) return 1; br = port->br; mep = br_mep_find_ifindex(br, port->dev->ifindex); if (unlikely(!mep)) /* No MEP on this port - must be forwarded */ return 0; mdlevel = hdr->mdlevel_version >> 5; if (mdlevel > mep->config.mdlevel) /* The level is above this MEP level - must be forwarded */ return 0; if ((hdr->mdlevel_version & 0x1F) != 0) { /* Invalid version */ mep->status.version_unexp_seen = true; return 1; } if (mdlevel < mep->config.mdlevel) { /* The level is below this MEP level */ mep->status.rx_level_low_seen = true; return 1; } if (hdr->opcode == BR_CFM_OPCODE_CCM) { /* CCM PDU received. */ /* MA ID is after common header + sequence number + MEP ID */ maid = skb_header_pointer(skb, CFM_CCM_PDU_MAID_OFFSET, sizeof(_maid), &_maid); if (!maid) return 1; if (memcmp(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data))) /* MA ID not as expected */ return 1; /* MEP ID is after common header + sequence number */ mepid = skb_header_pointer(skb, CFM_CCM_PDU_MEPID_OFFSET, sizeof(_mepid), &_mepid); if (!mepid) return 1; peer_mep = br_peer_mep_find(mep, (u32)ntohs(*mepid)); if (!peer_mep) return 1; /* Interval is in common header flags */ interval = hdr->flags & 0x07; if (mep->cc_config.exp_interval != pdu_to_interval(interval)) /* Interval not as expected */ return 1; /* A valid CCM frame is received */ if (peer_mep->cc_status.ccm_defect) { peer_mep->cc_status.ccm_defect = false; /* Change in CCM defect status - notify */ br_cfm_notify(RTM_NEWLINK, port); /* Start CCM RX timer */ ccm_rx_timer_start(peer_mep); } peer_mep->cc_status.seen = true; peer_mep->ccm_rx_count_miss = 0; /* RDI is in common header flags */ peer_mep->cc_status.rdi = (hdr->flags & 0x80) ? true : false; /* Sequence number is after common header */ snumber = skb_header_pointer(skb, CFM_CCM_PDU_SEQNR_OFFSET, sizeof(_snumber), &_snumber); if (!snumber) return 1; if (ntohl(*snumber) != (mep->ccm_rx_snumber + 1)) /* Unexpected sequence number */ peer_mep->cc_status.seq_unexp_seen = true; mep->ccm_rx_snumber = ntohl(*snumber); /* TLV end is after common header + sequence number + MEP ID + * MA ID + ITU reserved */ index = CFM_CCM_PDU_TLV_OFFSET; max = 0; do { /* Handle all TLVs */ size = ccm_tlv_extract(skb, index, peer_mep); index += size; max += 1; } while (size != 0 && max < 4); /* Max four TLVs possible */ return 1; } mep->status.opcode_unexp_seen = true; return 1; } static struct br_frame_type cfm_frame_type __read_mostly = { .type = cpu_to_be16(ETH_P_CFM), .frame_handler = br_cfm_frame_rx, }; int br_cfm_mep_create(struct net_bridge *br, const u32 instance, struct br_cfm_mep_create *const create, struct netlink_ext_ack *extack) { struct net_bridge_port *p; struct br_cfm_mep *mep; ASSERT_RTNL(); if (create->domain == BR_CFM_VLAN) { NL_SET_ERR_MSG_MOD(extack, "VLAN domain not supported"); return -EINVAL; } if (create->domain != BR_CFM_PORT) { NL_SET_ERR_MSG_MOD(extack, "Invalid domain value"); return -EINVAL; } if (create->direction == BR_CFM_MEP_DIRECTION_UP) { NL_SET_ERR_MSG_MOD(extack, "Up-MEP not supported"); return -EINVAL; } if (create->direction != BR_CFM_MEP_DIRECTION_DOWN) { NL_SET_ERR_MSG_MOD(extack, "Invalid direction value"); return -EINVAL; } p = br_mep_get_port(br, create->ifindex); if (!p) { NL_SET_ERR_MSG_MOD(extack, "Port is not related to bridge"); return -EINVAL; } mep = br_mep_find(br, instance); if (mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance already exists"); return -EEXIST; } /* In PORT domain only one instance can be created per port */ if (create->domain == BR_CFM_PORT) { mep = br_mep_find_ifindex(br, create->ifindex); if (mep) { NL_SET_ERR_MSG_MOD(extack, "Only one Port MEP on a port allowed"); return -EINVAL; } } mep = kzalloc(sizeof(*mep), GFP_KERNEL); if (!mep) return -ENOMEM; mep->create = *create; mep->instance = instance; rcu_assign_pointer(mep->b_port, p); INIT_HLIST_HEAD(&mep->peer_mep_list); INIT_DELAYED_WORK(&mep->ccm_tx_dwork, ccm_tx_work_expired); if (hlist_empty(&br->mep_list)) br_add_frame(br, &cfm_frame_type); hlist_add_tail_rcu(&mep->head, &br->mep_list); return 0; } static void mep_delete_implementation(struct net_bridge *br, struct br_cfm_mep *mep) { struct br_cfm_peer_mep *peer_mep; struct hlist_node *n_store; ASSERT_RTNL(); /* Empty and free peer MEP list */ hlist_for_each_entry_safe(peer_mep, n_store, &mep->peer_mep_list, head) { cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); hlist_del_rcu(&peer_mep->head); kfree_rcu(peer_mep, rcu); } cancel_delayed_work_sync(&mep->ccm_tx_dwork); RCU_INIT_POINTER(mep->b_port, NULL); hlist_del_rcu(&mep->head); kfree_rcu(mep, rcu); if (hlist_empty(&br->mep_list)) br_del_frame(br, &cfm_frame_type); } int br_cfm_mep_delete(struct net_bridge *br, const u32 instance, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } mep_delete_implementation(br, mep); return 0; } int br_cfm_mep_config_set(struct net_bridge *br, const u32 instance, const struct br_cfm_mep_config *const config, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } mep->config = *config; return 0; } int br_cfm_cc_config_set(struct net_bridge *br, const u32 instance, const struct br_cfm_cc_config *const config, struct netlink_ext_ack *extack) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } /* Check for no change in configuration */ if (memcmp(config, &mep->cc_config, sizeof(*config)) == 0) return 0; if (config->enable && !mep->cc_config.enable) /* CC is enabled */ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) cc_peer_enable(peer_mep); if (!config->enable && mep->cc_config.enable) /* CC is disabled */ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) cc_peer_disable(peer_mep); mep->cc_config = *config; mep->ccm_rx_snumber = 0; mep->ccm_tx_snumber = 1; return 0; } int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance, u32 mepid, struct netlink_ext_ack *extack) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } peer_mep = br_peer_mep_find(mep, mepid); if (peer_mep) { NL_SET_ERR_MSG_MOD(extack, "Peer MEP-ID already exists"); return -EEXIST; } peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL); if (!peer_mep) return -ENOMEM; peer_mep->mepid = mepid; peer_mep->mep = mep; INIT_DELAYED_WORK(&peer_mep->ccm_rx_dwork, ccm_rx_work_expired); if (mep->cc_config.enable) cc_peer_enable(peer_mep); hlist_add_tail_rcu(&peer_mep->head, &mep->peer_mep_list); return 0; } int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance, u32 mepid, struct netlink_ext_ack *extack) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } peer_mep = br_peer_mep_find(mep, mepid); if (!peer_mep) { NL_SET_ERR_MSG_MOD(extack, "Peer MEP-ID does not exists"); return -ENOENT; } cc_peer_disable(peer_mep); hlist_del_rcu(&peer_mep->head); kfree_rcu(peer_mep, rcu); return 0; } int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance, const bool rdi, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } mep->rdi = rdi; return 0; } int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, const struct br_cfm_cc_ccm_tx_info *const tx_info, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } if (memcmp(tx_info, &mep->cc_ccm_tx_info, sizeof(*tx_info)) == 0) { /* No change in tx_info. */ if (mep->cc_ccm_tx_info.period == 0) /* Transmission is not enabled - just return */ return 0; /* Transmission is ongoing, the end time is recalculated */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); return 0; } if (tx_info->period == 0 && mep->cc_ccm_tx_info.period == 0) /* Some change in info and transmission is not ongoing */ goto save; if (tx_info->period != 0 && mep->cc_ccm_tx_info.period != 0) { /* Some change in info and transmission is ongoing * The end time is recalculated */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); goto save; } if (tx_info->period == 0 && mep->cc_ccm_tx_info.period != 0) { cancel_delayed_work_sync(&mep->ccm_tx_dwork); goto save; } /* Start delayed work to transmit CCM frames. It is done with zero delay * to send first frame immediately */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, 0); save: mep->cc_ccm_tx_info = *tx_info; return 0; } int br_cfm_mep_count(struct net_bridge *br, u32 *count) { struct br_cfm_mep *mep; *count = 0; rcu_read_lock(); hlist_for_each_entry_rcu(mep, &br->mep_list, head) *count += 1; rcu_read_unlock(); return 0; } int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; *count = 0; rcu_read_lock(); hlist_for_each_entry_rcu(mep, &br->mep_list, head) hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) *count += 1; rcu_read_unlock(); return 0; } bool br_cfm_created(struct net_bridge *br) { return !hlist_empty(&br->mep_list); } /* Deletes the CFM instances on a specific bridge port */ void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port) { struct hlist_node *n_store; struct br_cfm_mep *mep; ASSERT_RTNL(); hlist_for_each_entry_safe(mep, n_store, &br->mep_list, head) if (mep->create.ifindex == port->dev->ifindex) mep_delete_implementation(br, mep); }
29 29 29 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2004, Instant802 Networks, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2022 Intel Corporation */ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/types.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wme.h" /* Default mapping in classifier to work with default * queue setup. */ const int ieee802_1d_to_ac[8] = { IEEE80211_AC_BE, IEEE80211_AC_BK, IEEE80211_AC_BK, IEEE80211_AC_BE, IEEE80211_AC_VI, IEEE80211_AC_VI, IEEE80211_AC_VO, IEEE80211_AC_VO }; static int wme_downgrade_ac(struct sk_buff *skb) { switch (skb->priority) { case 6: case 7: skb->priority = 5; /* VO -> VI */ return 0; case 4: case 5: skb->priority = 3; /* VI -> BE */ return 0; case 0: case 3: skb->priority = 2; /* BE -> BK */ return 0; default: return -1; } } /** * ieee80211_fix_reserved_tid - return the TID to use if this one is reserved * @tid: the assumed-reserved TID * * Returns: the alternative TID to use, or 0 on error */ static inline u8 ieee80211_fix_reserved_tid(u8 tid) { switch (tid) { case 0: return 3; case 1: return 2; case 2: return 1; case 3: return 0; case 4: return 5; case 5: return 4; case 6: return 7; case 7: return 6; } return 0; } static u16 ieee80211_downgrade_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; /* in case we are a client verify acm is not set for this ac */ while (sdata->wmm_acm & BIT(skb->priority)) { int ac = ieee802_1d_to_ac[skb->priority]; if (ifmgd->tx_tspec[ac].admitted_time && skb->priority == ifmgd->tx_tspec[ac].up) return ac; if (wme_downgrade_ac(skb)) { /* * This should not really happen. The AP has marked all * lower ACs to require admission control which is not * a reasonable configuration. Allow the frame to be * transmitted using AC_BK as a workaround. */ break; } } /* Check to see if this is a reserved TID */ if (sta && sta->reserved_tid == skb->priority) skb->priority = ieee80211_fix_reserved_tid(skb->priority); /* look up which queue to use for frames with this 1d tag */ return ieee802_1d_to_ac[skb->priority]; } /* Indicate which queue to use for this fully formed 802.11 frame */ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || local->hw.queues < IEEE80211_NUM_ACS) return 0; if (!ieee80211_is_data(hdr->frame_control)) { skb->priority = 7; return ieee802_1d_to_ac[skb->priority]; } if (!ieee80211_is_data_qos(hdr->frame_control)) { skb->priority = 0; return ieee802_1d_to_ac[skb->priority]; } p = ieee80211_get_qos_ctl(hdr); skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; return ieee80211_downgrade_queue(sdata, NULL, skb); } u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { const struct ethhdr *eth = (void *)skb->data; struct mac80211_qos_map *qos_map; bool qos; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); /* all mesh/ocb stations are required to support WME */ if ((sdata->vif.type == NL80211_IFTYPE_MESH_POINT && !is_multicast_ether_addr(eth->h_dest)) || (sdata->vif.type == NL80211_IFTYPE_OCB && sta)) qos = true; else if (sta) qos = sta->sta.wme; else qos = false; if (!qos) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } if (skb->protocol == sdata->control_port_protocol) { skb->priority = 7; goto downgrade; } /* use the data classifier to determine what 802.1d tag the * data frame has */ qos_map = rcu_dereference(sdata->qos_map); skb->priority = cfg80211_classify8021d(skb, qos_map ? &qos_map->qos_map : NULL); downgrade: return ieee80211_downgrade_queue(sdata, sta, skb); } /** * ieee80211_set_qos_hdr - Fill in the QoS header if there is one. * * @sdata: local subif * @skb: packet to be updated */ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; u8 flags; u8 *p; if (!ieee80211_is_data_qos(hdr->frame_control)) return; p = ieee80211_get_qos_ctl(hdr); /* don't overwrite the QoS field of injected frames */ if (info->flags & IEEE80211_TX_CTL_INJECTED) { /* do take into account Ack policy of injected frames */ if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; return; } /* set up the first byte */ /* * preserve everything but the TID and ACK policy * (which we both write here) */ flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK | IEEE80211_QOS_CTL_ACK_POLICY_MASK); if (is_multicast_ether_addr(hdr->addr1) || sdata->noack_map & BIT(tid)) { flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK; info->flags |= IEEE80211_TX_CTL_NO_ACK; } *p = flags | tid; /* set up the second byte */ p++; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* preserve RSPI and Mesh PS Level bit */ *p &= ((IEEE80211_QOS_CTL_RSPI | IEEE80211_QOS_CTL_MESH_PS_LEVEL) >> 8); /* Nulls don't have a mesh header (frame body) */ if (!ieee80211_is_qos_nullfunc(hdr->frame_control)) *p |= (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8); } else { *p = 0; } }
26 26 5 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 // SPDX-License-Identifier: GPL-2.0 /* * High-level sync()-related operations */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/writeback.h> #include <linux/syscalls.h> #include <linux/linkage.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/backing-dev.h> #include "internal.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) /* * Write out and wait upon all dirty data associated with this * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. */ int sync_filesystem(struct super_block *sb) { int ret = 0; /* * We need to be protected against the filesystem going from * r/o to r/w or vice versa. */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* * No point in syncing out anything if the filesystem is read-only. */ if (sb_rdonly(sb)) return 0; /* * Do the filesystem syncing work. For simple filesystems * writeback_inodes_sb(sb) just dirties buffers with inodes so we have * to submit I/O for these buffers via sync_blockdev(). This also * speeds up the wait == 1 case since in that case write_inode() * methods call sync_dirty_buffer() and thus effectively write one block * at a time. */ writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) { ret = sb->s_op->sync_fs(sb, 0); if (ret) return ret; } ret = sync_blockdev_nowait(sb->s_bdev); if (ret) return ret; sync_inodes_sb(sb); if (sb->s_op->sync_fs) { ret = sb->s_op->sync_fs(sb, 1); if (ret) return ret; } return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); static void sync_inodes_one_sb(struct super_block *sb, void *arg) { if (!sb_rdonly(sb)) sync_inodes_sb(sb); } static void sync_fs_one_sb(struct super_block *sb, void *arg) { if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) && sb->s_op->sync_fs) sb->s_op->sync_fs(sb, *(int *)arg); } /* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. Then we sync all inodes reliably * which effectively also waits for all flusher threads to finish doing * writeback. At this point all data is on disk so metadata should be stable * and we tell filesystems to sync their metadata via ->sync_fs() calls. * Finally, we writeout all block devices because some filesystems (e.g. ext2) * just write metadata (such as inodes or bitmaps) to block device page cache * and do not sync it on their own in ->sync_fs(). */ void ksys_sync(void) { int nowait = 0, wait = 1; wakeup_flusher_threads(WB_REASON_SYNC); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); sync_bdevs(true); if (unlikely(laptop_mode)) laptop_sync_completion(); } SYSCALL_DEFINE0(sync) { ksys_sync(); return 0; } static void do_sync_work(struct work_struct *work) { int nowait = 0; int wait = 1; /* * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); sync_bdevs(false); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); } void emergency_sync(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_sync_work); schedule_work(work); } } /* * sync a single super */ SYSCALL_DEFINE1(syncfs, int, fd) { CLASS(fd, f)(fd); struct super_block *sb; int ret, ret2; if (fd_empty(f)) return -EBADF; sb = fd_file(f)->f_path.dentry->d_sb; down_read(&sb->s_umount); ret = sync_filesystem(sb); up_read(&sb->s_umount); ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err); return ret ? ret : ret2; } /** * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync * @start: offset in bytes of the beginning of data range to sync * @end: offset in bytes of the end of data range (inclusive) * @datasync: perform only datasync * * Write back data in range @start..@end and metadata for @file to disk. If * @datasync is set only metadata needed to access modified file data is * written. */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; if (!file->f_op->fsync) return -EINVAL; if (!datasync && (inode_state_read_once(inode) & I_DIRTY_TIME)) mark_inode_dirty_sync(inode); return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); /** * vfs_fsync - perform a fsync or fdatasync on a file * @file: file to sync * @datasync: only perform a fdatasync operation * * Write back data and metadata for @file to disk. If @datasync is * set only metadata needed to access modified file data is written. */ int vfs_fsync(struct file *file, int datasync) { return vfs_fsync_range(file, 0, LLONG_MAX, datasync); } EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fsync(fd_file(f), datasync); } SYSCALL_DEFINE1(fsync, unsigned int, fd) { return do_fsync(fd, 0); } SYSCALL_DEFINE1(fdatasync, unsigned int, fd) { return do_fsync(fd, 1); } int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags) { int ret; struct address_space *mapping; loff_t endbyte; /* inclusive */ umode_t i_mode; ret = -EINVAL; if (flags & ~VALID_FLAGS) goto out; endbyte = offset + nbytes; if ((s64)offset < 0) goto out; if ((s64)endbyte < 0) goto out; if (endbyte < offset) goto out; if (sizeof(pgoff_t) == 4) { if (offset >= (0x100000000ULL << PAGE_SHIFT)) { /* * The range starts outside a 32 bit machine's * pagecache addressing capabilities. Let it "succeed" */ ret = 0; goto out; } if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) { /* * Out to EOF */ nbytes = 0; } } if (nbytes == 0) endbyte = LLONG_MAX; else endbyte--; /* inclusive */ i_mode = file_inode(file)->i_mode; ret = -ESPIPE; if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && !S_ISLNK(i_mode)) goto out; mapping = file->f_mapping; ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { ret = file_fdatawait_range(file, offset, endbyte); if (ret < 0) goto out; } if (flags & SYNC_FILE_RANGE_WRITE) { if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == SYNC_FILE_RANGE_WRITE_AND_WAIT) ret = filemap_fdatawrite_range(mapping, offset, endbyte); else ret = filemap_flush_range(mapping, offset, endbyte); if (ret < 0) goto out; } if (flags & SYNC_FILE_RANGE_WAIT_AFTER) ret = file_fdatawait_range(file, offset, endbyte); out: return ret; } /* * ksys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is * zero then ksys_sync_file_range() will operate from offset out to EOF. * * The flag bits are: * * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range * before performing the write. * * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the * range which are not presently under writeback. Note that this may block for * significant periods due to exhaustion of disk request structures. * * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range * after performing the write. * * Useful combinations of the flag bits are: * * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages * in the range which were dirty on entry to ksys_sync_file_range() are placed * under writeout. This is a start-write-for-data-integrity operation. * * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which * are not presently under writeout. This is an asynchronous flush-to-disk * operation. Not suitable for data integrity operations. * * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for * completion of writeout of all pages in the range. This will be used after an * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait * for that operation to complete and to return the result. * * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT): * a traditional sync() operation. This is a write-for-data-integrity operation * which will ensure that all pages in the range which were dirty on entry to * ksys_sync_file_range() are written to disk. It should be noted that disk * caches are not flushed by this call, so there are no guarantees here that the * data will be available on disk after a crash. * * * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any * I/O errors or ENOSPC conditions and will return those to the caller, after * clearing the EIO and ENOSPC flags in the address_space. * * It should be noted that none of these operations write out the file's * metadata. So unless the application is strictly performing overwrites of * already-instantiated disk blocks, there are no guarantees here that the data * will be available after a crash. */ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, unsigned int flags) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return sync_file_range(fd_file(f), offset, nbytes, flags); } SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, unsigned int, flags) { return ksys_sync_file_range(fd, offset, nbytes, flags); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE) COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset), compat_arg_u64_dual(nbytes), unsigned int, flags) { return ksys_sync_file_range(fd, compat_arg_u64_glue(offset), compat_arg_u64_glue(nbytes), flags); } #endif /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, loff_t, offset, loff_t, nbytes) { return ksys_sync_file_range(fd, offset, nbytes, flags); }
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BIO_INTEGRITY_H #define _LINUX_BIO_INTEGRITY_H #include <linux/bio.h> enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ struct bio_vec *bip_vec; }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip) return bip->bip_flags & flag; return false; } static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; } static inline void bip_set_seed(struct bio_integrity_payload *bip, sector_t seed) { bip->bip_iter.bi_sector = seed; } void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) { return -EINVAL; } static inline void bio_integrity_unmap_user(struct bio *bio) { } static inline bool bio_integrity_prep(struct bio *bio) { return true; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { return 0; } static inline void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { } static inline void bio_integrity_trim(struct bio *bio) { } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; } static inline struct bio_integrity_payload * bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } static inline int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); #endif /* _LINUX_BIO_INTEGRITY_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 /* SPDX-License-Identifier: GPL-2.0 */ /* * x86 TSC related functions */ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H #include <asm/asm.h> #include <asm/cpufeature.h> #include <asm/processor.h> #include <asm/msr.h> /** * rdtsc() - returns the current TSC without ordering constraints * * rdtsc() returns the result of RDTSC as a 64-bit integer. The * only ordering constraint it supplies is the ordering implied by * "asm volatile": it will put the RDTSC in the place you expect. The * CPU can and will speculatively execute that RDTSC, though, so the * results can be non-monotonic if compared on different CPUs. */ static __always_inline u64 rdtsc(void) { EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); return EAX_EDX_VAL(val, low, high); } /** * rdtsc_ordered() - read the current TSC in program order * * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. * It is ordered like a load to a global in-memory counter. It should * be impossible to observe non-monotonic rdtsc_unordered() behavior * across multiple CPUs as long as the TSC is synced. */ static __always_inline u64 rdtsc_ordered(void) { EAX_EDX_DECLARE_ARGS(val, low, high); /* * The RDTSC instruction is not ordered relative to memory * access. The Intel SDM and the AMD APM are both vague on this * point, but empirically an RDTSC instruction can be * speculatively executed before prior loads. An RDTSC * immediately after an appropriate barrier appears to be * ordered as a normal load, that is, it provides the same * ordering guarantees as reading from a global memory location * that some other imaginary CPU is updating continuously with a * time stamp. * * Thus, use the preferred barrier on the respective CPU, aiming for * RDTSCP as the default. */ asm volatile(ALTERNATIVE_2("rdtsc", "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, "rdtscp", X86_FEATURE_RDTSCP) : EAX_EDX_RET(val, low, high) /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ :: "ecx"); return EAX_EDX_VAL(val, low, high); } /* * Standard way to access the cycle counter. */ typedef unsigned long long cycles_t; extern unsigned int cpu_khz; extern unsigned int tsc_khz; extern void disable_TSC(void); static inline cycles_t get_cycles(void) { if (!IS_ENABLED(CONFIG_X86_TSC) && !cpu_feature_enabled(X86_FEATURE_TSC)) return 0; return rdtsc(); } #define get_cycles get_cycles extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; #ifdef CONFIG_X86_TSC extern bool tsc_async_resets; #else # define tsc_async_resets false #endif /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ #ifdef CONFIG_X86_TSC extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_target(void); #else static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } static inline void tsc_verify_tsc_adjust(bool resume) { } static inline void check_tsc_sync_target(void) { } #endif extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); unsigned long cpu_khz_from_msr(void); #endif /* _ASM_X86_TSC_H */
116 116 3 3 86 85 128 127 129 128 74 74 52 52 1 1 4 4 251 244 7 83 278 278 103 102 11 3 1 2 5 4 1 1 3 6 6 6 6 6 3 3 3 3 6 2 1 3 6 1 2 3 7 5 2 7 2 6 1 6 6 6 3 1 2 2 2 2 6 6 5 3 2 4 1 3 1 9 9 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 /* * net/tipc/node.c: TIPC node management routines * * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "node.h" #include "name_distr.h" #include "socket.h" #include "bcast.h" #include "monitor.h" #include "discover.h" #include "netlink.h" #include "trace.h" #include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7) }; struct tipc_link_entry { struct tipc_link *link; spinlock_t lock; /* per link */ u32 mtu; struct sk_buff_head inputq; struct tipc_media_addr maddr; }; struct tipc_bclink_entry { struct tipc_link *link; struct sk_buff_head inputq1; struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; u16 named_rcv_nxt; bool named_open; }; /** * struct tipc_node - TIPC node structure * @addr: network address of node * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any * @peer_id: 128-bit ID of peer * @peer_id_string: ID string of peer * @publ_list: list of publications * @conn_sks: list of connections (FIXME) * @timer: node's keepalive timer * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node * @peer_net: peer's net namespace * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; struct kref kref; rwlock_t lock; struct net *net; struct hlist_node hash; int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; struct tipc_bclink_entry bc_entry; int action_flags; struct list_head list; int state; bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; u32 signature; u32 link_id; u8 peer_id[16]; char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; struct net *peer_net; u32 peer_hash_mix; #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto *crypto_rx; #endif }; /* Node FSM states and events: */ enum { SELF_DOWN_PEER_DOWN = 0xdd, SELF_UP_PEER_UP = 0xaa, SELF_DOWN_PEER_LEAVING = 0xd1, SELF_UP_PEER_COMING = 0xac, SELF_COMING_PEER_UP = 0xca, SELF_LEAVING_PEER_DOWN = 0x1d, NODE_FAILINGOVER = 0xf0, NODE_SYNCHING = 0xcc }; enum { SELF_ESTABL_CONTACT_EVT = 0xece, SELF_LOST_CONTACT_EVT = 0x1ce, PEER_ESTABL_CONTACT_EVT = 0x9ece, PEER_LOST_CONTACT_EVT = 0x91ce, NODE_FAILOVER_BEGIN_EVT = 0xfbe, NODE_FAILOVER_END_EVT = 0xfee, NODE_SYNCH_BEGIN_EVT = 0xcbe, NODE_SYNCH_END_EVT = 0xcee }; static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr); static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; u32 peer_port; u32 peer_node; struct list_head list; }; static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) return NULL; return n->links[bearer_id].link; } int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; unsigned int mtu = MAX_MSG_SIZE; n = tipc_node_find(net, addr); if (unlikely(!n)) return mtu; /* Allow MAX_MSG_SIZE when building connection oriented message * if they are in the same core network */ if (n->peer_net && connected) { tipc_node_put(n); return mtu; } bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; tipc_node_put(n); return mtu; } bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) { u8 *own_id = tipc_own_id(net); struct tipc_node *n; if (!own_id) return true; if (addr == tipc_own_addr(net)) { memcpy(id, own_id, TIPC_NODEID_LEN); return true; } n = tipc_node_find(net, addr); if (!n) return false; memcpy(id, &n->peer_id, TIPC_NODEID_LEN); tipc_node_put(n); return true; } u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; u16 caps; n = tipc_node_find(net, addr); if (unlikely(!n)) return TIPC_NODE_CAPABILITIES; caps = n->capabilities; tipc_node_put(n); return caps; } u32 tipc_node_get_addr(struct tipc_node *node) { return (node) ? node->addr : 0; } char *tipc_node_get_id_str(struct tipc_node *node) { return node->peer_id_string; } #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) { return (__n) ? __n->crypto_rx : NULL; } struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) { struct tipc_node *n; n = tipc_node_find(net, addr); return (n) ? n->crypto_rx : NULL; } #endif static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&n->crypto_rx); #endif kfree(n); } static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); call_rcu(&n->rcu, tipc_node_free); } void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } /* * tipc_node_find - locate specified node object, if it exists */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; break; } rcu_read_unlock(); return node; } /* tipc_node_find_by_id - locate specified node object by its 128-bit id * Note: this function is called only when a discovery request failed * to find the node by its 32-bit id, and is not time critical */ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool found = false; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { read_lock_bh(&n->lock); if (!memcmp(id, n->peer_id, 16) && kref_get_unless_zero(&n->kref)) found = true; read_unlock_bh(&n->lock); if (found) break; } rcu_read_unlock(); return found ? n : NULL; } static void tipc_node_read_lock(struct tipc_node *n) __acquires(n->lock) { read_lock_bh(&n->lock); } static void tipc_node_read_unlock(struct tipc_node *n) __releases(n->lock) { read_unlock_bh(&n->lock); } static void tipc_node_write_lock(struct tipc_node *n) __acquires(n->lock) { write_lock_bh(&n->lock); } static void tipc_node_write_unlock_fast(struct tipc_node *n) __releases(n->lock) { write_unlock_bh(&n->lock); } static void tipc_node_write_unlock(struct tipc_node *n) __releases(n->lock) { struct tipc_socket_addr sk; struct net *net = n->net; u32 flags = n->action_flags; struct list_head *publ_list; struct tipc_uaddr ua; u32 bearer_id, node; if (likely(!flags)) { write_unlock_bh(&n->lock); return; } tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, TIPC_LINK_STATE, n->addr, n->addr); sk.ref = n->link_id; sk.node = tipc_own_addr(net); node = n->addr; bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) tipc_publ_notify(net, publ_list, node, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, node, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, node, bearer_id); tipc_nametbl_publish(net, &ua, &sk, sk.ref); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, node, bearer_id); tipc_nametbl_withdraw(net, &ua, &sk, sk.ref); } } static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) { int net_id = tipc_netid(n->net); struct tipc_net *tn_peer; struct net *tmp; u32 hash_chk; if (n->peer_net) return; for_each_net_rcu(tmp) { tn_peer = tipc_net(tmp); if (!tn_peer) continue; /* Integrity checking whether node exists in namespace or not */ if (tn_peer->net_id != net_id) continue; if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) continue; hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); if (hash_mixes ^ hash_chk) continue; n->peer_net = tmp; n->peer_hash_mix = hash_mixes; break; } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr) ?: tipc_node_find_by_id(net, peer_id); if (n) { if (!n->preliminary) goto update; if (preliminary) goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link refresh failed, no memory\n"); tipc_node_write_unlock_fast(n); tipc_node_put(n); n = NULL; goto exit; } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_del_rcu(&n->list); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); tipc_node_write_unlock_fast(n); update: if (n->peer_hash_mix ^ hash_mixes) tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } tipc_node_write_unlock_fast(n); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } tipc_nodeid2string(n->peer_id_string, peer_id); #ifdef CONFIG_TIPC_CRYPTO if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); kfree(n); n = NULL; goto exit; } #endif n->addr = addr; n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; n->peer_net = NULL; n->peer_hash_mix = 0; /* Assign kernel local namespace if exists */ tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); INIT_HLIST_NODE(&n->hash); INIT_LIST_HEAD(&n->list); INIT_LIST_HEAD(&n->publ_list); INIT_LIST_HEAD(&n->conn_sks); skb_queue_head_init(&n->bc_entry.namedq); skb_queue_head_init(&n->bc_entry.inputq1); __skb_queue_head_init(&n->bc_entry.arrvq); skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; if (!preliminary && !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); tipc_node_put(n); n = NULL; goto exit; } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ n->keepalive_intv = 10000; intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; /* Link with lowest tolerance determines timer interval */ if (intv < n->keepalive_intv) n->keepalive_intv = intv; /* Ensure link's abort limit corresponds to current tolerance */ tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete_from_list(struct tipc_node *node) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_key_flush(node->crypto_rx); #endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); } static void tipc_node_delete(struct tipc_node *node) { trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); timer_delete_sync(&node->timer); tipc_node_put(node); } void tipc_node_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); tipc_node_write_unlock_fast(n); tipc_node_put(n); } void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_del_init(subscr); tipc_node_write_unlock_fast(n); tipc_node_put(n); } int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; int err = 0; if (in_own_node(net, dnode)) return 0; node = tipc_node_find(net, dnode); if (!node) { pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) { struct tipc_node *node; struct tipc_sock_conn *conn, *safe; if (in_own_node(net, dnode)) return; node = tipc_node_find(net, dnode); if (!node) return; tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } tipc_node_write_unlock(node); tipc_node_put(node); } static void tipc_node_clear_links(struct tipc_node *node) { int i; for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link_entry *le = &node->links[i]; if (le->link) { kfree(le->link); le->link = NULL; node->link_cnt--; } } } /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; /* If lock held by tipc_node_stop() the node will be deleted anyway */ if (!spin_trylock_bh(&tn->node_list_lock)) return false; tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { tipc_node_clear_links(peer); tipc_node_delete_from_list(peer); deleted = true; } tipc_node_write_unlock(peer); if (!deleted) { spin_unlock_bh(&tn->node_list_lock); return deleted; } /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(peer->net, (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) { struct tipc_node *n = timer_container_of(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; int bearer_id; int rc = 0; trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); return; } #ifdef CONFIG_TIPC_CRYPTO /* Take any crypto key related actions first */ tipc_crypto_timeout(n->crypto_rx); #endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be * recalculated with link lowest tolerance */ tipc_node_read_lock(n); n->keepalive_intv = 10000; tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; if (le->link) { spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); spin_unlock_bh(&le->lock); remains--; } tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** * __tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. */ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); if (!tipc_link_is_up(nl)) return; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } /* Second link => redistribute slots */ if (tipc_link_prio(nl) > tipc_link_prio(ol)) { pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_media_addr *maddr; tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } /** * tipc_node_link_failover() - start failover in case "half-failover" * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. * This can happen when e.g.:: * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> * * ==> Node 1 does never start link/node failover! * * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) * @tnl: tunnel link * @xmitq: queue for messages to be xmited on tnl link later */ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { /* Avoid to be "self-failover" that can never end */ if (!tipc_link_is_up(tnl)) return; /* Don't rush, failure link may be in the process of resetting */ if (l && !tipc_link_is_reset(l)) return; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_failover_prepare(l, tnl, xmitq); if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); } /** * __tipc_node_link_down - handle loss of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr) { struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; *slot1 = INVALID_BEARER_ID; for (i = 0; i < MAX_BEARERS; i++) { _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; if (_l == l) continue; prio = tipc_link_prio(_l); if (prio < highest) continue; if (prio > highest) { highest = prio; *slot0 = i; *slot1 = i; continue; } *slot1 = i; } if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); return; } tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ *bearer_id = n->active_links[0]; tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; struct sk_buff_head xmitq; if (!l) return; __skb_queue_head_init(&xmitq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } if (delete) { kfree(l); le->link = NULL; n->link_cnt--; } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } bool tipc_node_is_up(struct net *net, u32 addr) { struct tipc_node *n; bool retval = false; if (in_own_node(net, addr)) return true; n = tipc_node_find(net, addr); if (!n) return false; retval = node_is_up(n); tipc_node_put(n); return retval; } static u32 tipc_node_suggest_addr(struct net *net, u32 addr) { struct tipc_node *n; addr ^= tipc_net(net)->random; while ((n = tipc_node_find(net, addr))) { tipc_node_put(n); addr++; } return addr; } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool preliminary; u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); if (n) { if (!memcmp(n->peer_id, id, NODE_ID_LEN)) addr = 0; tipc_node_put(n); if (!addr) return 0; return tipc_node_suggest_addr(net, addr); } /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { sugg_addr = n->addr; preliminary = n->preliminary; tipc_node_put(n); if (!preliminary) return sugg_addr; } /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); return 0; } void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool link_is_reset = false; bool accept_addr = false; bool reset = false; char *if_name; unsigned long intv; u16 session; *dupl_addr = false; *respond = false; n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, false); if (!n) return; tipc_node_write_lock(n); le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; } else if (sign_match && !addr_match && link_up) { /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. */ *dupl_addr = true; } else if (sign_match && !addr_match && !link_up) { /* Peer link has changed i/f address without rebooting. * It may also be a cloned or malicious peer; we can't * distinguish between the two. * The signature is correct, so we must accept. */ accept_addr = true; *respond = true; reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already * reset and re-established contact with the peer, before * receiving a discovery message from that node. * (The peer happened to receive one from this node first). * - The peer came back so fast that our side has not * discovered it yet. Probing from this side will soon * reset the link, since there can be no working link * endpoint at the peer end, and the link will re-establish. * Accept the signature, since it comes from a known peer. */ n->signature = signature; } else if (!sign_match && addr_match && !link_up) { /* The peer node has rebooted. * Accept signature, since it is a known peer. */ n->signature = signature; *respond = true; } else if (!sign_match && !addr_match && link_up) { /* Peer rebooted with new address, or a new/duplicate peer. * Ignore until the link goes down, if ever. */ *dupl_addr = true; } else if (!sign_match && !addr_match && !link_up) { /* Peer rebooted with new address, or it is a new peer. * Accept signature and address. */ n->signature = signature; accept_addr = true; *respond = true; reset = true; } if (!accept_addr) goto exit; /* Now create new link if not already existing */ if (!l) { if (n->link_cnt == 2) goto exit; if_name = strchr(b->name, ':') + 1; get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, &n->bc_entry.namedq, &l)) { *respond = false; goto exit; } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); if (n->link_cnt == 1) { intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_link_down(n, bearer_id, true); } rcu_read_unlock(); } static void tipc_node_reset_links(struct tipc_node *n) { int i; pr_warn("Resetting all links to %x\n", n->addr); trace_tipc_node_reset_links(n, true, " "); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); } } /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) { int state = n->state; switch (state) { case SELF_DOWN_PEER_DOWN: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_COMING; break; case PEER_ESTABL_CONTACT_EVT: state = SELF_COMING_PEER_UP; break; case SELF_LOST_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_UP: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_BEGIN_EVT: state = NODE_SYNCHING; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_END_EVT: break; default: goto illegal_evt; } break; case SELF_DOWN_PEER_LEAVING: switch (evt) { case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case SELF_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_COMING: switch (evt) { case PEER_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_BEGIN_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_COMING_PEER_UP: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_LEAVING_PEER_DOWN: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case NODE_FAILINGOVER: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_FAILOVER_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_SYNCH_END_EVT: default: goto illegal_evt; } break; case NODE_SYNCHING: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case NODE_SYNCH_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; default: pr_err("Unknown node fsm state %x\n", state); break; } trace_tipc_node_fsm(n->peer_id, n->state, state, evt); n->state = state; return; illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); trace_tipc_node_fsm(n->peer_id, n->state, state, evt); } static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; pr_debug("Lost contact with %x\n", n->addr); n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); trace_tipc_node_lost_contact(n, true, " "); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; n->peer_net = NULL; n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } } /** * tipc_node_get_linkname - get the name of a link * * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer * @len: size of @linkname output buffer * * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); if (!node) return err; if (bearer_id >= MAX_BEARERS) goto exit; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { strscpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); exit: tipc_node_put(node); return err; } /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NODE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head inputq; switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (msg_connected(hdr) || msg_named(hdr) || msg_direct(hdr)) { tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; } if (msg_mcast(hdr)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); return; } return; case MSG_FRAGMENTER: if (tipc_msg_assemble(list)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); } return; case GROUP_PROTOCOL: case CONN_MANAGER: tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; case LINK_PROTOCOL: case NAME_DISTRIBUTOR: case TUNNEL_PROTOCOL: case BCAST_PROTOCOL: return; default: return; } } /** * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) { struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; bool node_up = false; struct net *peer_net; int bearer_id; int rc; if (in_own_node(net, dnode)) { tipc_loopback_trace(net, list); spin_lock_init(&list->lock); tipc_sk_rcv(net, list); return 0; } n = tipc_node_find(net, dnode); if (unlikely(!n)) { __skb_queue_purge(list); return -EHOSTUNREACH; } rcu_read_lock(); tipc_node_read_lock(n); node_up = node_is_up(n); peer_net = n->peer_net; tipc_node_read_unlock(n); if (node_up && peer_net && check_net(peer_net)) { /* xmit inner linux container */ tipc_lxc_xmit(peer_net, list); if (likely(skb_queue_empty(list))) { rcu_read_unlock(); tipc_node_put(n); return 0; } } rcu_read_unlock(); tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); tipc_node_put(n); __skb_queue_purge(list); return -EHOSTUNREACH; } __skb_queue_head_init(&xmitq); le = &n->links[bearer_id]; spin_lock_bh(&le->lock); rc = tipc_link_xmit(le->link, list, &xmitq); spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); return rc; } /* tipc_node_xmit_skb(): send single buffer to destination * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; } /* tipc_node_distr_xmit(): send single buffer msgs to individual destinations * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected */ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) { struct sk_buff *skb; u32 selector, dnode; while ((skb = __skb_dequeue(xmitq))) { selector = msg_origport(buf_msg(skb)); dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, selector); } return 0; } void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; u16 dummy; u32 dst; /* Use broadcast if all nodes support it */ if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { __skb_queue_head_init(&xmitq); __skb_queue_tail(&xmitq, skb); tipc_bcast_xmit(net, &xmitq, &dummy); return; } /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; if (in_own_node(net, dst)) continue; if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) break; msg_set_destnode(buf_msg(txskb), dst); tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); kfree_skb(skb); } static void tipc_node_mcast_rcv(struct tipc_node *n) { struct tipc_bclink_entry *be = &n->bc_entry; /* 'arrvq' is under inputq2's lock protection */ spin_lock_bh(&be->inputq2.lock); spin_lock_bh(&be->inputq1.lock); skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); spin_unlock_bh(&be->inputq1.lock); spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2); } static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_link *ucl; int rc; rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); return; } if (!(rc & TIPC_LINK_SND_STATE)) return; /* If probe message, a STATE response will be sent anyway */ if (msg_probe(hdr)) return; /* Produce a STATE message carrying broadcast NACK */ tipc_node_read_lock(n); ucl = n->links[bearer_id].link; if (ucl) tipc_link_build_state_msg(ucl, xmitq); tipc_node_read_unlock(n); } /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @bearer_id: id of bearer message arrived on * * Invoked with no locks held. */ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id) { int rc; struct sk_buff_head xmitq; struct tipc_bclink_entry *be; struct tipc_link_entry *le; struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); u32 dnode = msg_destnode(hdr); struct tipc_node *n; __skb_queue_head_init(&xmitq); /* If NACK for other node, let rcv link for that node peek into it */ if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net))) n = tipc_node_find(net, dnode); else n = tipc_node_find(net, msg_prevnode(hdr)); if (!n) { kfree_skb(skb); return; } be = &n->bc_entry; le = &n->links[bearer_id]; rc = tipc_bcast_rcv(net, be->link, skb); /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_STATE) { tipc_node_read_lock(n); tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); } if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) tipc_node_reset_links(n); tipc_node_put(n); } /** * tipc_node_check_state - check and if necessary update node state * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet * @xmitq: queue for messages to be xmited on * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; int pb_id; if (trace_tipc_node_check_state_enabled()) { trace_tipc_skb_dump(skb, false, "skb for node state check"); trace_tipc_node_check_state(n, true, " "); } l = n->links[bearer_id].link; if (!l) return false; rcv_nxt = tipc_link_rcv_nxt(l); if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) return true; /* Find parallel link, if any */ for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) { if ((pb_id != bearer_id) && n->links[pb_id].link) { pl = n->links[pb_id].link; break; } } if (!tipc_link_validate_msg(l, hdr)) { trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!"); return false; } /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) return true; if (!msg_peer_link_is_up(hdr)) return true; tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); } if (state == SELF_DOWN_PEER_LEAVING) { if (msg_peer_node_is_up(hdr)) return false; tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); return true; } if (state == SELF_LEAVING_PEER_DOWN) return false; /* Ignore duplicate packets */ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && !tipc_link_is_reset(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } /* If parallel link was already down, and this happened before * the tunnel link came up, node failover was never started. * Ensure that a FAILOVER_MSG is sent to get peer out of * NODE_FAILINGOVER state, also this node must accept * TUNNEL_MSGs from peer. */ if (n->state != NODE_FAILINGOVER) tipc_node_link_failover(n, pl, l, xmitq); /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open parallel link when tunnel link reaches synch point */ if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { if (!more(rcv_nxt, n->sync_point)) return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); if (pl) tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); return true; } /* No syncing needed if only one link */ if (!pl || !tipc_link_is_up(pl)) return true; /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { if (n->capabilities & TIPC_TUNNEL_ENHANCED) syncpt = msg_syncpt(hdr); else syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } } /* Open tunnel link when parallel link reaches synch point */ if (n->state == NODE_SYNCHING) { if (tipc_link_is_synching(l)) { tnl = l; } else { tnl = pl; pl = l; } inputq_len = skb_queue_len(tipc_link_inputq(pl)); dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len; if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; } if (l == pl) return true; if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) return true; if (usr == LINK_PROTOCOL) return true; return false; } return true; } /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_link_entry *le; struct tipc_msg *hdr; struct tipc_node *n; int bearer_id = b->identity; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; #ifdef CONFIG_TIPC_CRYPTO struct tipc_ehdr *ehdr; /* Check if message must be decrypted first */ if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) goto rcv; ehdr = (struct tipc_ehdr *)skb->data; if (likely(ehdr->user != LINK_CONFIG)) { n = tipc_node_find(net, ntohl(ehdr->addr)); if (unlikely(!n)) goto discard; } else { n = tipc_node_find_by_id(net, ehdr->id); } skb_dst_force(skb); tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; rcv: #endif /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(&skb))) goto discard; __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { if (unlikely(usr == LINK_CONFIG)) return tipc_disc_rcv(net, skb, b); else return tipc_node_bc_rcv(net, skb, bearer_id); } /* Discard unicast link messages destined for another node */ if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) goto discard; /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) goto discard; le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) { if (unlikely(skb_linearize(skb))) { tipc_node_put(n); goto discard; } hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { spin_lock_bh(&le->lock); if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } spin_unlock_bh(&le->lock); } tipc_node_read_unlock(n); /* Check/update node state before receiving */ if (unlikely(skb)) { if (unlikely(skb_linearize(skb))) goto out_node_put; tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } } tipc_node_write_unlock(n); } if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); out_node_put: tipc_node_put(n); discard: kfree_skb(skb); } void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; struct sk_buff_head xmitq; struct tipc_link_entry *e; struct tipc_node *n; __skb_queue_head_init(&xmitq); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; if (e->link) { if (prop == TIPC_NLA_PROP_TOL) tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); /* Update MTU for node link entry */ e->mtu = tipc_link_mss(e->link); } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); } int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are * mutually exclusive cases */ if (attrs[TIPC_NLA_NET_ADDR]) { addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; } if (attrs[TIPC_NLA_NET_NODEID]) { if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); addr = hash128to32(node_id); } if (in_own_node(net, addr)) return -ENOTSUPP; spin_lock_bh(&tn->node_list_lock); peer = tipc_node_find(net, addr); if (!peer) { spin_unlock_bh(&tn->node_list_lock); return -ENXIO; } tipc_node_write_lock(peer); if (peer->state != SELF_DOWN_PEER_DOWN && peer->state != SELF_DOWN_PEER_LEAVING) { tipc_node_write_unlock(peer); err = -EBUSY; goto err_out; } tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); spin_unlock_bh(&tn->node_list_lock); return err; } int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); int done = cb->args[0]; int last_addr = cb->args[1]; struct tipc_node *node; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (last_addr) { node = tipc_node_find(net, last_addr); if (!node) { rcu_read_unlock(); /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the NLMSG_DONE message having * the NLM_F_DUMP_INTR flag set if the node state * changed while we released the lock. */ cb->prev_seq = 1; return -EPIPE; } tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { if (node->preliminary) continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; else continue; } tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; tipc_node_read_unlock(node); goto out; } tipc_node_read_unlock(node); } done = 1; out: cb->args[0] = done; cb->args[1] = last_addr; rcu_read_unlock(); return skb->len; } /* tipc_node_find_by_name - locate owner node of link by link's name * @net: the applicable net namespace * @name: pointer to link name string * @bearer_id: pointer to index in 'node->links' array where the link was found. * * Returns pointer to node owning the link, or 0 if no matching link is found. */ static struct tipc_node *tipc_node_find_by_name(struct net *net, const char *link_name, unsigned int *bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l; struct tipc_node *n; struct tipc_node *found_node = NULL; int i; *bearer_id = 0; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_read_lock(n); for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l && !strcmp(tipc_link_name(l), link_name)) { *bearer_id = i; found_node = n; break; } } tipc_node_read_unlock(n); if (found_node) break; } rcu_read_unlock(); return found_node; } int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) { int err; int res = 0; int bearer_id; char *name; struct tipc_link *link; struct tipc_node *node; struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); __skb_queue_head_init(&xmitq); if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); if (strcmp(name, tipc_bclink_name) == 0) return tipc_nl_bc_link_set(net, attrs); node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; } if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); tipc_link_set_queue_limits(link, tipc_link_min_win(link), max_win); } } out: tipc_node_read_unlock(node); tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, NULL); return res; } int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct tipc_nl_msg msg; char *name; int err; msg.portid = info->snd_portid; msg.seq = info->snd_seq; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) { err = -EINVAL; goto err_free; } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); err = -EINVAL; goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); if (err) goto err_free; } return genlmsg_reply(msg.skb, info); err_free: nlmsg_free(msg.skb); return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) { int err; char *link_name; unsigned int bearer_id; struct tipc_link *link; struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); err = -EINVAL; if (!strcmp(link_name, tipc_bclink_name)) { err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; } else if (strstr(link_name, tipc_bclink_name)) { rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); link = node->bc_entry.link; if (link && !strcmp(link_name, tipc_link_name(link))) { err = tipc_bclink_reset_stats(net, link); tipc_node_read_unlock(node); break; } tipc_node_read_unlock(node); } rcu_read_unlock(); return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); if (!node) return -EINVAL; le = &node->links[bearer_id]; tipc_node_read_lock(node); spin_lock_bh(&le->lock); link = node->links[bearer_id].link; if (!link) { spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return -EINVAL; } tipc_link_reset_stats(link); spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return 0; } /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, struct tipc_node *node, u32 *prev_link, bool bc_link) { u32 i; int err; for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; if (!node->links[i].link) continue; err = __tipc_nl_add_link(net, msg, node->links[i].link, NLM_F_MULTI); if (err) return err; } if (bc_link) { *prev_link = i; err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); if (err) return err; } *prev_link = 0; return 0; } int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; bool bc_link = cb->args[3]; int err; if (done) return 0; if (!prev_node) { /* Check if broadcast-receiver links dumping is needed */ if (attrs && attrs[TIPC_NLA_LINK]) { err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], tipc_nl_link_policy, NULL); if (unlikely(err)) return err; if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) return -EINVAL; bc_link = true; } } msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. */ cb->prev_seq = 1; goto out; } tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } else { err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } done = 1; out: rcu_read_unlock(); cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; cb->args[3] = bc_link; return skb->len; } int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; struct net *net = sock_net(skb->sk); int err; if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, info->extack); if (err) return err; if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); err = tipc_nl_monitor_set_threshold(net, val); if (err) return err; } return 0; } static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) { struct nlattr *attrs; void *hdr; u32 val; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; val = tipc_nl_monitor_get_threshold(net); if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_add_monitor_prop(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_bearer = cb->args[0]; struct tipc_nl_msg msg; int bearer_id; int err; if (prev_bearer == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } rtnl_unlock(); cb->args[0] = bearer_id; return skb->len; } int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_node = cb->args[1]; u32 bearer_id = cb->args[2]; int done = cb->args[0]; struct tipc_nl_msg msg; int err; if (!prev_node) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, NULL); if (err) return err; if (!mon[TIPC_NLA_MON_REF]) return -EINVAL; bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); if (bearer_id >= MAX_BEARERS) return -EINVAL; } if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); if (!err) done = 1; rtnl_unlock(); cb->args[0] = done; cb->args[1] = prev_node; cb->args[2] = bearer_id; return skb->len; } #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, struct tipc_aead_key **pkey) { struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; struct tipc_aead_key *key; if (!attr) return -ENODATA; if (nla_len(attr) < sizeof(*key)) return -EINVAL; key = (struct tipc_aead_key *)nla_data(attr); if (key->keylen > TIPC_AEAD_KEYLEN_MAX || nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; *pkey = key; return 0; } static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) { struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; if (!attr) return -ENODATA; if (nla_len(attr) < TIPC_NODEID_LEN) return -EINVAL; *node_id = (u8 *)nla_data(attr); return 0; } static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) { struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; if (!attr) return -ENODATA; *intv = nla_get_u32(attr); return 0; } static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; bool rekeying = true, master_key = false; u8 *id, *own_id, mode; u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) return -EINVAL; rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) return rc; own_id = tipc_own_id(net); if (!own_id) { GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); return -EPERM; } rc = tipc_nl_retrieve_rekeying(attrs, &intv); if (rc == -ENODATA) rekeying = false; rc = tipc_nl_retrieve_key(attrs, &ukey); if (rc == -ENODATA && rekeying) goto rekeying; else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); if (rc) return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: mode = CLUSTER_KEY; master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); if (unlikely(!n)) return -ENOMEM; c = n->crypto_rx; } break; default: return rc; } /* Initiate the TX/RX key */ rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } else if (c == tx) { /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); rekeying: /* Schedule TX rekeying if needed */ tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_set_key(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_node *n; tipc_crypto_key_flush(tn->crypto_tx); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); return 0; } int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_flush_key(skb, info); rtnl_unlock(); return err; } #endif /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped * @more: dump more? * - false: dump only tipc node data * - true: dump node link data as well * @buf: returned buffer of dump data in format */ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) { int i = 0; size_t sz = (more) ? NODE_LMAX : NODE_LMIN; if (!n) { i += scnprintf(buf, sz, "node data: (null)\n"); return i; } i += scnprintf(buf, sz, "node data: %x", n->addr); i += scnprintf(buf + i, sz - i, " %x", n->state); i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); i += scnprintf(buf + i, sz - i, " %x", n->action_flags); i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); i += scnprintf(buf + i, sz - i, " %u", n->sync_point); i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); i += scnprintf(buf + i, sz - i, " %u", n->working_links); i += scnprintf(buf + i, sz - i, " %x", n->capabilities); i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); if (!more) return i; i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[0].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[1].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "bclink:\n "); i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); return i; } void tipc_node_pre_cleanup_net(struct net *exit_net) { struct tipc_node *n; struct tipc_net *tn; struct net *tmp; rcu_read_lock(); for_each_net_rcu(tmp) { if (tmp == exit_net) continue; tn = tipc_net(tmp); if (!tn) continue; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_rcu(n, &tn->node_list, list) { if (!n->peer_net) continue; if (n->peer_net != exit_net) continue; tipc_node_write_lock(n); n->peer_net = NULL; n->peer_hash_mix = 0; tipc_node_write_unlock_fast(n); break; } spin_unlock_bh(&tn->node_list_lock); } rcu_read_unlock(); }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/page_io.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, * Asynchronous swapping added 30.12.95. Stephen Tweedie * Removed race in async swapping. 14.4.1996. Bruno Haible * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman */ #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/swapops.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/psi.h> #include <linux/uio.h> #include <linux/sched/task.h> #include <linux/delayacct.h> #include <linux/zswap.h> #include "swap.h" static void __end_swap_bio_write(struct bio *bio) { struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { /* * We failed to write the page out to swap-space. * Re-dirty the page in order to avoid it being reclaimed. * Also print a dire warning that things will go BAD (tm) * very quickly. * * Also clear PG_reclaim to avoid folio_rotate_reclaimable() */ folio_mark_dirty(folio); pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); folio_clear_reclaim(folio); } folio_end_writeback(folio); } static void end_swap_bio_write(struct bio *bio) { __end_swap_bio_write(bio); bio_put(bio); } static void __end_swap_bio_read(struct bio *bio) { struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } else { folio_mark_uptodate(folio); } folio_unlock(folio); } static void end_swap_bio_read(struct bio *bio) { __end_swap_bio_read(bio); bio_put(bio); } int generic_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; unsigned blocks_per_page; unsigned long page_no; unsigned blkbits; sector_t probe_block; sector_t last_block; sector_t lowest_block = -1; sector_t highest_block = 0; int nr_extents = 0; int ret; blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; /* * Map all the blocks into the extent tree. This code doesn't try * to be very smart. */ probe_block = 0; page_no = 0; last_block = i_size_read(inode) >> blkbits; while ((probe_block + blocks_per_page) <= last_block && page_no < sis->max) { unsigned block_in_page; sector_t first_block; cond_resched(); first_block = probe_block; ret = bmap(inode, &first_block); if (ret || !first_block) goto bad_bmap; /* * It must be PAGE_SIZE aligned on-disk */ if (first_block & (blocks_per_page - 1)) { probe_block++; goto reprobe; } for (block_in_page = 1; block_in_page < blocks_per_page; block_in_page++) { sector_t block; block = probe_block + block_in_page; ret = bmap(inode, &block); if (ret || !block) goto bad_bmap; if (block != first_block + block_in_page) { /* Discontiguity */ probe_block++; goto reprobe; } } first_block >>= (PAGE_SHIFT - blkbits); if (page_no) { /* exclude the header page */ if (first_block < lowest_block) lowest_block = first_block; if (first_block > highest_block) highest_block = first_block; } /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ ret = add_swap_extent(sis, page_no, 1, first_block); if (ret < 0) goto out; nr_extents += ret; page_no++; probe_block += blocks_per_page; reprobe: continue; } ret = nr_extents; *span = 1 + highest_block - lowest_block; if (page_no == 0) page_no = 1; /* force Empty message */ sis->max = page_no; sis->pages = page_no - 1; out: return ret; bad_bmap: pr_err("swapon: swapfile has holes\n"); ret = -EINVAL; goto out; } static bool is_folio_zero_filled(struct folio *folio) { unsigned int pos, last_pos; unsigned long *data; unsigned int i; last_pos = PAGE_SIZE / sizeof(*data) - 1; for (i = 0; i < folio_nr_pages(folio); i++) { data = kmap_local_folio(folio, i * PAGE_SIZE); /* * Check last word first, incase the page is zero-filled at * the start and has non-zero data at the end, which is common * in real-world workloads. */ if (data[last_pos]) { kunmap_local(data); return false; } for (pos = 0; pos < last_pos; pos++) { if (data[pos]) { kunmap_local(data); return false; } } kunmap_local(data); } return true; } static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); set_bit(swp_offset(entry), sis->zeromap); } count_vm_events(SWPOUT_ZERO, nr_pages); if (objcg) { count_objcg_events(objcg, SWPOUT_ZERO, nr_pages); obj_cgroup_put(objcg); } } static void swap_zeromap_folio_clear(struct folio *folio) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); swp_entry_t entry; unsigned int i; for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); clear_bit(swp_offset(entry), sis->zeromap); } } /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { int ret = 0; if (folio_free_swap(folio)) goto out_unlock; /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); goto out_unlock; } /* * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. * The bits in zeromap are protected by the locked swapcache folio * and atomic updates are used to protect against read-modify-write * corruption due to other zero swap entries seeing concurrent updates. */ if (is_folio_zero_filled(folio)) { swap_zeromap_folio_set(folio); goto out_unlock; } /* * Clear bits this folio occupies in the zeromap to prevent zero data * being read in from any previous zero writes that occupied the same * swap entries. */ swap_zeromap_folio_clear(folio); if (zswap_store(folio)) { count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); goto out_unlock; } if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; } __swap_writepage(folio, swap_plug); return 0; out_unlock: folio_unlock(folio); return ret; } static inline void count_swpout_vm_event(struct folio *folio) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (unlikely(folio_test_pmd_mappable(folio))) { count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } #endif count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio)); count_vm_events(PSWPOUT, folio_nr_pages(folio)); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) { struct cgroup_subsys_state *css; struct mem_cgroup *memcg; memcg = folio_memcg(folio); if (!memcg) return; rcu_read_lock(); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } #else #define bio_associate_blkg_from_page(bio, folio) do { } while (0) #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ struct swap_iocb { struct kiocb iocb; struct bio_vec bvec[SWAP_CLUSTER_MAX]; int pages; int len; }; static mempool_t *sio_pool; int sio_pool_init(void) { if (!sio_pool) { mempool_t *pool = mempool_create_kmalloc_pool( SWAP_CLUSTER_MAX, sizeof(struct swap_iocb)); if (cmpxchg(&sio_pool, NULL, pool)) mempool_destroy(pool); } if (!sio_pool) return -ENOMEM; return 0; } static void sio_write_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); struct page *page = sio->bvec[0].bv_page; int p; if (ret != sio->len) { /* * In the case of swap-over-nfs, this can be a * temporary failure if the system has limited * memory for allocating transmit buffers. * Mark the page dirty and avoid * folio_rotate_reclaimable but rate-limit the * messages. */ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, swap_dev_pos(page_swap_entry(page))); for (p = 0; p < sio->pages; p++) { page = sio->bvec[p].bv_page; set_page_dirty(page); ClearPageReclaim(page); } } for (p = 0; p < sio->pages; p++) end_page_writeback(sio->bvec[p].bv_page); mempool_free(sio, sio_pool); } static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_iocb *sio = swap_plug ? *swap_plug : NULL; struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); if (sio) { if (sio->iocb.ki_filp != swap_file || sio->iocb.ki_pos + sio->len != pos) { swap_write_unplug(sio); sio = NULL; } } if (!sio) { sio = mempool_alloc(sio_pool, GFP_NOIO); init_sync_kiocb(&sio->iocb, swap_file); sio->iocb.ki_complete = sio_write_complete; sio->iocb.ki_pos = pos; sio->pages = 0; sio->len = 0; } bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); sio->len += folio_size(folio); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) { swap_write_unplug(sio); sio = NULL; } if (swap_plug) *swap_plug = sio; } static void swap_writepage_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP); bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(&bio, folio); count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); submit_bio_wait(&bio); __end_swap_bio_write(&bio); } static void swap_writepage_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO); bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_write; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(bio, folio); count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); submit_bio(bio); } void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(folio, swap_plug); /* * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. */ else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) swap_writepage_bdev_sync(folio, sis); else swap_writepage_bdev_async(folio, sis); } void swap_write_unplug(struct swap_iocb *sio) { struct iov_iter from; struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); } static void sio_read_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); int p; if (ret == sio->len) { for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); folio_mark_uptodate(folio); folio_unlock(folio); } count_vm_events(PSWPIN, sio->pages); } else { for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); folio_unlock(folio); } pr_alert_ratelimited("Read-error on swap-device\n"); } mempool_free(sio, sio_pool); } static bool swap_read_folio_zeromap(struct folio *folio) { int nr_pages = folio_nr_pages(folio); struct obj_cgroup *objcg; bool is_zeromap; /* * Swapping in a large folio that is partially in the zeromap is not * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). */ if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages, &is_zeromap) != nr_pages)) return true; if (!is_zeromap) return false; objcg = get_obj_cgroup_from_folio(folio); count_vm_events(SWPIN_ZERO, nr_pages); if (objcg) { count_objcg_events(objcg, SWPIN_ZERO, nr_pages); obj_cgroup_put(objcg); } folio_zero_range(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); return true; } static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_iocb *sio = NULL; loff_t pos = swap_dev_pos(folio->swap); if (plug) sio = *plug; if (sio) { if (sio->iocb.ki_filp != sis->swap_file || sio->iocb.ki_pos + sio->len != pos) { swap_read_unplug(sio); sio = NULL; } } if (!sio) { sio = mempool_alloc(sio_pool, GFP_KERNEL); init_sync_kiocb(&sio->iocb, sis->swap_file); sio->iocb.ki_pos = pos; sio->iocb.ki_complete = sio_read_complete; sio->pages = 0; sio->len = 0; } bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); sio->len += folio_size(folio); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { swap_read_unplug(sio); sio = NULL; } if (plug) *plug = sio; } static void swap_read_folio_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. */ get_task_struct(current); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); put_task_struct(current); } static void swap_read_folio_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); /* * Count submission time as memory stall and delay. When the device * is congested, or the submitting cgroup IO-throttled, submission * can be a significant part of overall IO time. */ if (workingset) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); } delayacct_swapin_start(); if (swap_read_folio_zeromap(folio)) { folio_unlock(folio); goto finish; } if (zswap_load(folio) != -ENOENT) goto finish; /* We have to read from slower devices. Increase zswap protection. */ zswap_folio_swapin(folio); if (data_race(sis->flags & SWP_FS_OPS)) { swap_read_folio_fs(folio, plug); } else if (synchronous) { swap_read_folio_bdev_sync(folio, sis); } else { swap_read_folio_bdev_async(folio, sis); } finish: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } delayacct_swapin_end(); } void __swap_read_unplug(struct swap_iocb *sio) { struct iov_iter from; struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); }
29 8 3 3 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/lockd/svc.c * * This is the central lockd service. * * FIXME: Separate the lockd NFS server functionality from the lockd NFS * client functionality. Oh why didn't Sun create two separate * services in the first place? * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/moduleparam.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/uio.h> #include <linux/smp.h> #include <linux/mutex.h> #include <linux/freezer.h> #include <linux/inetdevice.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <net/ip.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> #include "netns.h" #include "procfs.h" #include "netlink.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) static struct svc_program nlmsvc_program; const struct nlmsvc_binding *nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; static void nlmsvc_request_retry(struct timer_list *tl) { svc_wake_up(nlmsvc_serv); } DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry); unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ static unsigned long nlm_grace_period; unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; /* * Constants needed for the sysctl interface. */ static const unsigned long nlm_grace_period_min = 0; static const unsigned long nlm_grace_period_max = 240; static const unsigned long nlm_timeout_min = 3; static const unsigned long nlm_timeout_max = 20; #ifdef CONFIG_SYSCTL static const int nlm_port_min = 0, nlm_port_max = 65535; static struct ctl_table_header * nlm_sysctl_table; #endif static unsigned long get_lockd_grace_period(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); /* Return the net-ns specific grace period, if there is one */ if (ln->gracetime) return ln->gracetime * HZ; /* Note: nlm_timeout should always be nonzero */ if (nlm_grace_period) return roundup(nlm_grace_period, nlm_timeout) * HZ; else return nlm_timeout * 5 * HZ; } static void grace_ender(struct work_struct *grace) { struct delayed_work *dwork = to_delayed_work(grace); struct lockd_net *ln = container_of(dwork, struct lockd_net, grace_period_end); locks_end_grace(&ln->lockd_manager); } static void set_grace_period(struct net *net) { unsigned long grace_period = get_lockd_grace_period(net); struct lockd_net *ln = net_generic(net, lockd_net_id); locks_start_grace(net, &ln->lockd_manager); cancel_delayed_work_sync(&ln->grace_period_end); schedule_delayed_work(&ln->grace_period_end, grace_period); } /* * This is the lockd kernel thread */ static int lockd(void *vrqstp) { struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); svc_thread_init_status(rqstp, 0); /* try_to_freeze() is called from svc_recv() */ set_freezable(); dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. */ while (!svc_thread_should_stop(rqstp)) { nlmsvc_retry_blocked(rqstp); svc_recv(rqstp); } if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); dprintk("lockd_down: service stopped\n"); svc_exit_thread(rqstp); return 0; } static int create_lockd_listener(struct svc_serv *serv, const char *name, struct net *net, const int family, const unsigned short port, const struct cred *cred) { struct svc_xprt *xprt; xprt = svc_find_xprt(serv, name, net, family, 0); if (xprt == NULL) return svc_xprt_create(serv, name, net, family, port, SVC_SOCK_DEFAULTS, cred); svc_xprt_put(xprt); return 0; } static int create_lockd_family(struct svc_serv *serv, struct net *net, const int family, const struct cred *cred) { struct lockd_net *ln = net_generic(net, lockd_net_id); int err; err = create_lockd_listener(serv, "udp", net, family, ln->udp_port ? ln->udp_port : nlm_udpport, cred); if (err < 0) return err; return create_lockd_listener(serv, "tcp", net, family, ln->tcp_port ? ln->tcp_port : nlm_tcpport, cred); } /* * Ensure there are active UDP and TCP listeners for lockd. * * Even if we have only TCP NFS mounts and/or TCP NFSDs, some * local services (such as rpc.statd) still require UDP, and * some NFS servers do not yet support NLM over TCP. * * Returns zero if all listeners are available; otherwise a * negative errno value is returned. */ static int make_socks(struct svc_serv *serv, struct net *net, const struct cred *cred) { static int warned; int err; err = create_lockd_family(serv, net, PF_INET, cred); if (err < 0) goto out_err; err = create_lockd_family(serv, net, PF_INET6, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_err; warned = 0; return 0; out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); svc_xprt_destroy_all(serv, net, true); return err; } static int lockd_up_net(struct svc_serv *serv, struct net *net, const struct cred *cred) { struct lockd_net *ln = net_generic(net, lockd_net_id); int error; if (ln->nlmsvc_users++) return 0; error = svc_bind(serv, net); if (error) goto err_bind; error = make_socks(serv, net, cred); if (error < 0) goto err_bind; set_grace_period(net); dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: ln->nlmsvc_users--; return error; } static void lockd_down_net(struct svc_serv *serv, struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! net=%x\n", __func__, net->ns.inum); BUG(); } } static int lockd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inetaddr_notifier = { .notifier_call = lockd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int lockd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inet6addr_notifier = { .notifier_call = lockd_inet6addr_event, }; #endif static int lockd_get(void) { struct svc_serv *serv; int error; if (nlmsvc_serv) { nlmsvc_users++; return 0; } /* * Sanity check: if there's no pid, * we should be the first user ... */ if (nlmsvc_users) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; } error = svc_set_num_threads(serv, NULL, 1); if (error < 0) { svc_destroy(&serv); return error; } nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); nlmsvc_users++; return 0; } static void lockd_put(void) { if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n")) return; if (--nlmsvc_users) return; unregister_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif svc_set_num_threads(nlmsvc_serv, NULL, 0); timer_delete_sync(&nlmsvc_retry); svc_destroy(&nlmsvc_serv); dprintk("lockd_down: service destroyed\n"); } /* * Bring up the lockd process if it's not already up. */ int lockd_up(struct net *net, const struct cred *cred) { int error; mutex_lock(&nlmsvc_mutex); error = lockd_get(); if (error) goto err; error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { lockd_put(); goto err; } err: mutex_unlock(&nlmsvc_mutex); return error; } EXPORT_SYMBOL_GPL(lockd_up); /* * Decrement the user count and bring down lockd if we're the last. */ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); lockd_down_net(nlmsvc_serv, net); lockd_put(); mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL_GPL(lockd_down); #ifdef CONFIG_SYSCTL /* * Sysctl parameters (same as module parameters, different interface). */ static const struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_grace_period_min, .extra2 = (unsigned long *) &nlm_grace_period_max, }, { .procname = "nlm_timeout", .data = &nlm_timeout, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_timeout_min, .extra2 = (unsigned long *) &nlm_timeout_max, }, { .procname = "nlm_udpport", .data = &nlm_udpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nlm_tcpport", .data = &nlm_tcpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nsm_use_hostnames", .data = &nsm_use_hostnames, .maxlen = sizeof(bool), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", .data = &nsm_local_state, .maxlen = sizeof(nsm_local_state), .mode = 0644, .proc_handler = proc_douintvec, .extra1 = SYSCTL_ZERO, }, }; #endif /* CONFIG_SYSCTL */ /* * Module (and sysfs) parameters. */ #define param_set_min_max(name, type, which_strtol, min, max) \ static int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ char *endp; \ __typeof__(type) num = which_strtol(val, &endp, 0); \ if (endp == val || *endp || num < (min) || num > (max)) \ return -EINVAL; \ *((type *) kp->arg) = num; \ return 0; \ } static inline int is_callback(u32 proc) { return proc == NLMPROC_GRANTED || proc == NLMPROC_GRANTED_MSG || proc == NLMPROC_TEST_RES || proc == NLMPROC_LOCK_RES || proc == NLMPROC_CANCEL_RES || proc == NLMPROC_UNLOCK_RES || proc == NLMPROC_NSM_NOTIFY; } static enum svc_auth_status lockd_authenticate(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: rqstp->rq_auth_stat = rpc_auth_ok; if (rqstp->rq_proc == 0) return SVC_OK; if (is_callback(rqstp->rq_proc)) { /* Leave it to individual procedures to * call nlmsvc_lookup_host(rqstp) */ return SVC_OK; } return svc_set_client(rqstp); } rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } param_set_min_max(port, int, simple_strtol, 0, 65535) param_set_min_max(grace_period, unsigned long, simple_strtoul, nlm_grace_period_min, nlm_grace_period_max) param_set_min_max(timeout, unsigned long, simple_strtoul, nlm_timeout_min, nlm_timeout_max) MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION "."); MODULE_LICENSE("GPL"); module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong, &nlm_grace_period, 0644); module_param_call(nlm_timeout, param_set_timeout, param_get_ulong, &nlm_timeout, 0644); module_param_call(nlm_udpport, param_set_port, param_get_int, &nlm_udpport, 0644); module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); module_param(nsm_use_hostnames, bool, 0644); static int lockd_init_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->lockd_manager.list); ln->lockd_manager.block_opens = false; INIT_LIST_HEAD(&ln->nsm_handles); return 0; } static void lockd_exit_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); WARN_ONCE(!list_empty(&ln->lockd_manager.list), "net %x %s: lockd_manager.list is not empty\n", net->ns.inum, __func__); WARN_ONCE(!list_empty(&ln->nsm_handles), "net %x %s: nsm_handles list is not empty\n", net->ns.inum, __func__); WARN_ONCE(delayed_work_pending(&ln->grace_period_end), "net %x %s: grace_period_end was not cancelled\n", net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { .init = lockd_init_net, .exit = lockd_exit_net, .id = &lockd_net_id, .size = sizeof(struct lockd_net), }; /* * Initialising and terminating the module. */ static int __init init_nlm(void) { int err; #ifdef CONFIG_SYSCTL err = -ENOMEM; nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls); if (nlm_sysctl_table == NULL) goto err_sysctl; #endif err = register_pernet_subsys(&lockd_net_ops); if (err) goto err_pernet; err = genl_register_family(&lockd_nl_family); if (err) goto err_netlink; err = lockd_create_procfs(); if (err) goto err_procfs; return 0; err_procfs: genl_unregister_family(&lockd_nl_family); err_netlink: unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); err_sysctl: #endif return err; } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); genl_unregister_family(&lockd_nl_family); lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); #endif } module_init(init_nlm); module_exit(exit_nlm); /** * nlmsvc_dispatch - Process an NLM Request * @rqstp: incoming request * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ static int nlmsvc_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; *statp = procp->pc_func(rqstp); if (*statp == rpc_drop_reply) return 0; if (*statp != rpc_success) return 1; if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; return 1; out_decode_err: *statp = rpc_garbage_args; return 1; out_encode_err: *statp = rpc_system_err; return 1; } /* * Define NLM program and procedures */ static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]); static const struct svc_version nlmsvc_version1 = { .vs_vers = 1, .vs_nproc = 17, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version1_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]); static const struct svc_version nlmsvc_version3 = { .vs_vers = 3, .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #ifdef CONFIG_LOCKD_V4 static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]); static const struct svc_version nlmsvc_version4 = { .vs_vers = 4, .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4), .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, #ifdef CONFIG_LOCKD_V4 [4] = &nlmsvc_version4, #endif }; #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ .pg_nvers = NLM_NRVERS, /* number of entries in nlmsvc_version */ .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, }; /** * lockd_nl_server_set_doit - set the lockd server parameters via netlink * @skb: reply buffer * @info: netlink metadata and command arguments * * This updates the per-net values. When updating the values in the init_net * namespace, also update the "legacy" global values. * * Return 0 on success or a negative errno. */ int lockd_nl_server_set_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct lockd_net *ln = net_generic(net, lockd_net_id); const struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, LOCKD_A_SERVER_GRACETIME)) return -EINVAL; if (info->attrs[LOCKD_A_SERVER_GRACETIME] || info->attrs[LOCKD_A_SERVER_TCP_PORT] || info->attrs[LOCKD_A_SERVER_UDP_PORT]) { attr = info->attrs[LOCKD_A_SERVER_GRACETIME]; if (attr) { u32 gracetime = nla_get_u32(attr); if (gracetime > nlm_grace_period_max) return -EINVAL; ln->gracetime = gracetime; if (net == &init_net) nlm_grace_period = gracetime; } attr = info->attrs[LOCKD_A_SERVER_TCP_PORT]; if (attr) { ln->tcp_port = nla_get_u16(attr); if (net == &init_net) nlm_tcpport = ln->tcp_port; } attr = info->attrs[LOCKD_A_SERVER_UDP_PORT]; if (attr) { ln->udp_port = nla_get_u16(attr); if (net == &init_net) nlm_udpport = ln->udp_port; } } return 0; } /** * lockd_nl_server_get_doit - get lockd server parameters via netlink * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int lockd_nl_server_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct lockd_net *ln = net_generic(net, lockd_net_id); void *hdr; int err; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_iput(skb, info); if (!hdr) { err = -EMSGSIZE; goto err_free_msg; } err = nla_put_u32(skb, LOCKD_A_SERVER_GRACETIME, ln->gracetime) || nla_put_u16(skb, LOCKD_A_SERVER_TCP_PORT, ln->tcp_port) || nla_put_u16(skb, LOCKD_A_SERVER_UDP_PORT, ln->udp_port); if (err) goto err_free_msg; genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_free_msg: nlmsg_free(skb); return err; }
23 21 21 3 17 11 1 2 2 3 2 1 2 2 1 2 17 17 17 1 1 3 11 11 2 3 1 23 2 26 26 23 23 6 1 2 17 11 2 1 2 3 2 2 2 17 4 11 2 2 1 1 2 1 1 8 11 11 11 11 1 1 1 4 17 17 1 17 17 11 11 10 4 4 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for SCTP. * * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_NONE] = "NONE", [SCTP_CONNTRACK_CLOSED] = "CLOSED", [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT #define sCE SCTP_CONNTRACK_COOKIE_ECHOED #define sES SCTP_CONNTRACK_ESTABLISHED #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sIV SCTP_CONNTRACK_MAX /* These are the descriptions of the states: NOTE: These state names are tantalizingly similar to the states of an SCTP endpoint. But the interpretation of the states is a little different, considering that these are the states of the connection and not of an end point. Please note the subtleties. -Kiran NONE - Nothing so far. COOKIE WAIT - We have seen an INIT chunk in the original direction, or also an INIT_ACK chunk in the reply direction. COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. */ /* TODO - I have assumed that the first INIT is in the original direction. This messes things when an INIT comes in the reply direction in CLOSED state. - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - Full Multi Homing support. */ /* SCTP conntrack state transitions */ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ /* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ /* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, /* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #endif /* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned long *map, const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; int flag; flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { nf_ct_l4proto_log_invalid(skb, ct, state, "%s failed. chunk num %d, type %d, len %d flag %d\n", __func__, count, sch->type, sch->length, flag); return 1; } if (map) set_bit(sch->type, map); } return count == 0; } static int sctp_new_state(enum ip_conntrack_dir dir, enum sctp_conntrack cur_state, int chunk_type) { int i; switch (chunk_type) { case SCTP_CID_INIT: i = 0; break; case SCTP_CID_INIT_ACK: i = 1; break; case SCTP_CID_ABORT: i = 2; break; case SCTP_CID_SHUTDOWN: i = 3; break; case SCTP_CID_SHUTDOWN_ACK: i = 4; break; case SCTP_CID_ERROR: i = 5; break; case SCTP_CID_COOKIE_ECHO: i = 6; break; case SCTP_CID_COOKIE_ACK: i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: i = 8; break; case SCTP_CID_HEARTBEAT: i = 9; break; case SCTP_CID_HEARTBEAT_ACK: i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type %d, Will stay in %s\n", chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } return sctp_conntracks[dir][i][cur_state]; } /* Don't need lock here: this conntrack not in circulation yet */ static noinline bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct sctphdr *sh, unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u32 offset, count; memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ if (new_state == SCTP_CONNTRACK_NONE || new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return false; } /* Copy the vtag into the state info */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _inithdr, *ih; /* Sec 8.5.1 (A) */ if (sh->vtag) return false; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(_inithdr), &_inithdr); if (!ih) return false; pr_debug("Setting vtag %x for new conn\n", ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; } static bool sctp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { const struct sctphdr *sh; const char *logmsg; if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; } skb->ip_summed = CHECKSUM_UNNECESSARY; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); const struct sctphdr *sh; struct sctphdr _sctph; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ if (test_bit(SCTP_CID_ABORT, map) || test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || test_bit(SCTP_CID_COOKIE_ACK, map)) return -NF_ACCEPT; if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; } /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { nf_ct_l4proto_log_invalid(skb, ct, state, "verification tag check failed %x vs %x for dir %d", sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* (B) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* (C) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ACK) { ct->proto.sctp.init[dir] = 0; ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.last_dir = dir; ignore = true; continue; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || ct->proto.sctp.last_dir == dir) goto out_unlock; ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.vtag[dir] = sh->vtag; ct->proto.sctp.vtag[!dir] = 0; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } old_state = ct->proto.sctp.state; new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { nf_ct_l4proto_log_invalid(skb, ct, state, "Invalid, old_state %d, dir %d, type %d", old_state, dir, sch->type); goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _ih, *ih; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) ct->proto.sctp.init[!dir] = 0; ct->proto.sctp.init[dir] = 1; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so * port reuse by client or NAT middlebox cannot * keep entry alive indefinitely (incl. nat info). */ if (new_state == SCTP_CONNTRACK_CLOSED && old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; } else if (sch->type == SCTP_CID_INIT_ACK) { struct sctp_inithdr _ih, *ih; __be32 vtag; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; vtag = ct->proto.sctp.vtag[!dir]; if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) goto out_unlock; /* collision */ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && vtag != ih->init_tag) goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (new_state == SCTP_CONNTRACK_ESTABLISHED && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } } spin_unlock_bh(&ct->lock); /* allow but do not refresh timeout */ if (ignore) return NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; out_unlock: spin_unlock_bh(&ct->lock); out: return -NF_ACCEPT; } static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { case SCTP_CONNTRACK_SHUTDOWN_SENT: case SCTP_CONNTRACK_SHUTDOWN_RECD: case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; #define SCTP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 4) + \ NLA_ALIGN(NLA_HDRLEN + 4)) static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; int err; /* updates may not contain the internal protocol info, skip parsing */ if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, sctp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_SCTP_STATE] || !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) return -EINVAL; spin_lock_bh(&ct->lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; if (!timeouts) timeouts = sn->timeouts; /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } static int sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; /* timeouts[0] is unused, init it so ->timeouts[0] contains * 'new' timeout, like udp or icmp. */ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .can_early_drop = sctp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_SCTP_MAX, .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> #include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/dax.h> #include <linux/ksm.h> #include <linux/pgalloc.h> #include <asm/tlb.h> #include "internal.h" #include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_NO_PTE_TABLE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, SCAN_PAGE_COUNT, SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, }; #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); /* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static unsigned long khugepaged_sleep_expire; static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); /* * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. * * Note that these are only respected if collapse was initiated by khugepaged. */ unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; /** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * * There is only the one khugepaged_scan instance of this cursor structure. */ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; #ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t __sleep_millisecs_store(const char *buf, size_t count, unsigned int *millisecs) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); } static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int pages; int err; err = kstrtouint(buf, 10, &pages); if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; return count; } static struct kobj_attribute pages_to_scan_attr = __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over * any unmapped ptes in turn potentially increasing the memory * footprint of the vmas. When max_ptes_none is 0 khugepaged will not * reduce the available free memory in the system as it * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. */ static ssize_t max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = __ATTR_RW(max_ptes_none); static ssize_t max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t max_ptes_swap_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; return count; } static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR_RW(max_ptes_swap); static ssize_t max_ptes_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t max_ptes_shared_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); if (err || max_ptes_shared > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; return count; } static struct kobj_attribute khugepaged_max_ptes_shared_attr = __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, &khugepaged_max_ptes_swap_attr.attr, &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, NULL, }; struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static bool pte_none_or_zero(pte_t pte) { if (pte_none(pte)) return true; return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); } int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: #ifdef CONFIG_S390 /* * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 * can't handle this properly after s390_enable_sie, so we simply * ignore the madvise to prevent qemu from causing a SIGSEGV. */ if (mm_has_pgste(vma->vm_mm)) return 0; #endif *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* * If the vma become good for khugepaged to scan, * register it here without waiting a page fault that * may not happen any time soon. */ khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; /* * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning * this vma even if we leave the mm registered in khugepaged if * it got registered before VM_NOHUGEPAGE was set. */ break; } return 0; } int __init khugepaged_init(void) { mm_slot_cache = KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } void __init khugepaged_destroy(void) { kmem_cache_destroy(mm_slot_cache); } static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) { /* * We cover the anon, shmem and the file-backed case here; file-backed * hugepages, when configured in, are determined by the global control. * Anon pmd-sized hugepages are determined by the pmd-size control. * Shmem pmd-sized hugepages are also determined by its pmd-size control, * except when the global shmem_huge is set to SHMEM_HUGE_DENY. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && hugepage_global_enabled()) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_always)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && hugepage_global_enabled()) return true; if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) return true; return false; } void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *slot; int wakeup; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; slot = mm_slot_alloc(mm_slot_cache); if (!slot) return; spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); } void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); if (slot && khugepaged_scan.mm_slot != slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } else if (slot) { /* * This is required to serialize against * hpage_collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); } } static void release_pte_folio(struct folio *folio) { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); folio_unlock(folio); folio_putback_lru(folio); } static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) continue; VM_WARN_ON_ONCE(!pte_present(pteval)); pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; folio = pfn_folio(pfn); if (folio_test_large(folio)) continue; release_pte_folio(folio); } list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { list_del(&folio->lru); release_pte_folio(folio); } } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long start_addr, pte_t *pte, struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } if (!pte_present(pteval)) { result = SCAN_PTE_NON_PRESENT; goto out; } if (pte_uffd_wp(pteval)) { result = SCAN_PTE_UFFD_WP; goto out; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); /* See hpage_collapse_scan_pmd(). */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } } if (folio_test_large(folio)) { struct folio *f; /* * Check if we have dealt with the compound page * already */ list_for_each_entry(f, compound_pagelist, lru) { if (folio == f) goto next; } } /* * We can do it before folio_isolate_lru because the * folio can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } /* * Check if the page has any GUP (or other external) pins. * * The page table that maps the page has been already unlinked * from the page table tree and this process cannot get * an additional pin on the page. * * New pins can come later if the page is shared across fork, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ if (!folio_isolate_lru(folio)) { folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (folio_test_large(folio)) list_add_tail(&folio->lru, compound_pagelist); next: /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; pte_t pteval; pte_t *_pte; unsigned int nr_ptes; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, address += nr_ptes * PAGE_SIZE) { nr_ptes = 1; pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (pte_none(pteval)) continue; /* * ptl mostly unnecessary. */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { struct page *src_page = pte_page(pteval); src = page_folio(src_page); if (folio_test_large(src)) { unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); } else { release_pte_folio(src); } /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). */ spin_lock(ptl); clear_ptes(vma->vm_mm, address, _pte, nr_ptes); folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); free_swap_cache(src); folio_put_refs(src, nr_ptes); } } list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { list_del(&src->lru); node_stat_sub_folio(src, NR_ISOLATED_ANON + folio_is_file_lru(src)); folio_unlock(src); free_swap_cache(src); folio_putback_lru(src); } } static void __collapse_huge_page_copy_failed(pte_t *pte, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, struct list_head *compound_pagelist) { spinlock_t *pmd_ptl; /* * Re-establish the PMD to point to the original page table * entry. Restoring PMD needs to be done prior to releasing * pages. Since pages are still isolated and locked here, * acquiring anon_vma_lock_write is unnecessary. */ pmd_ptl = pmd_lock(vma->vm_mm, pmd); pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); spin_unlock(pmd_ptl); /* * Release both raw and compound pages isolated * in __collapse_huge_page_isolate. */ release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); } /* * __collapse_huge_page_copy - attempts to copy memory contents from raw * pages to a hugepage. Cleans up the raw pages if copying succeeds; * otherwise restores the original page table and releases isolated raw pages. * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. * * @pte: starting of the PTEs to copy from * @folio: the new hugepage to copy contents to * @pmd: pointer to the new hugepage's PMD * @orig_pmd: the original raw pages' PMD * @vma: the original raw pages' virtual memory area * @address: starting address to copy * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned int i; int result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. */ for (i = 0; i < HPAGE_PMD_NR; i++) { pte_t pteval = ptep_get(pte + i); struct page *page = folio_page(folio, i); unsigned long src_addr = address + i * PAGE_SIZE; struct page *src_page; if (pte_none_or_zero(pteval)) { clear_user_highpage(page, src_addr); continue; } src_page = pte_page(pteval); if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { result = SCAN_COPY_MC; break; } } if (likely(result == SCAN_SUCCEED)) __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, compound_pagelist); else __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, compound_pagelist); return result; } static void khugepaged_alloc_sleep(void) { DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; /* * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; } #define khugepaged_defrag() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; } #ifdef CONFIG_NUMA static int hpage_collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) if (cc->node_load[nid] > max_value) { max_value = cc->node_load[nid]; target_node = nid; } for_each_online_node(nid) { if (max_value == cc->node_load[nid]) node_set(nid, cc->alloc_nmask); } return target_node; } #else static int hpage_collapse_find_target_node(struct collapse_control *cc) { return 0; } #endif /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) return SCAN_PAGE_ANON; return SCAN_SUCCEED; } static inline int check_pmd_state(pmd_t *pmd) { pmd_t pmde = pmdp_get_lockless(pmd); if (pmd_none(pmde)) return SCAN_NO_PTE_TABLE; /* * The folio may be under migration when khugepaged is trying to * collapse it. Migration success or failure will eventually end * up with a present PMD mapping a folio again. */ if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_NO_PTE_TABLE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) return SCAN_NO_PTE_TABLE; return SCAN_SUCCEED; } static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) { *pmd = mm_find_pmd(mm, address); if (!*pmd) return SCAN_NO_PTE_TABLE; return check_pmd_state(*pmd); } static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) { pmd_t *new_pmd; int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); if (result != SCAN_SUCCEED) return result; if (new_pmd != pmd) return SCAN_FAIL; return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); int result; pte_t *pte = NULL; spinlock_t *ptl; for (addr = start_addr; addr < end; addr += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, .address = addr, .pgoff = linear_page_index(vma, addr), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; if (!pte++) { /* * Here the ptl is only used to check pte_same() in * do_swap_page(), so readonly version is enough. */ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_NO_PTE_TABLE; goto out; } } vmf.orig_pte = ptep_get_lockless(pte); if (pte_none(vmf.orig_pte) || pte_present(vmf.orig_pte)) continue; vmf.pte = pte; vmf.ptl = ptl; ret = do_swap_page(&vmf); /* Which unmaps pte (after perhaps re-checking the entry) */ pte = NULL; /* * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because * we do not retry here and swap entry will remain in pagetable * resulting in later failure. */ if (ret & VM_FAULT_RETRY) { /* Likely, but not guaranteed, that page lock failed */ result = SCAN_PAGE_LOCK; goto out; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); result = SCAN_FAIL; goto out; } swapped_in++; } if (pte) pte_unmap(pte); /* Drain LRU cache to remove extra pin on the swapped in pages */ if (swapped_in) lru_add_drain(); result = SCAN_SUCCEED; out: trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); return result; } static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); int node = hpage_collapse_find_target_node(cc); struct folio *folio; folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); if (!folio) { *foliop = NULL; count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *foliop = NULL; return SCAN_CGROUP_CHARGE_FAIL; } count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); *foliop = folio; return SCAN_SUCCEED; } static int collapse_huge_page(struct mm_struct *mm, unsigned long address, int referenced, int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves * sync compaction, and we do not need to hold the mmap_lock during * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); result = alloc_charge_folio(&folio, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } result = find_pmd_or_thp_or_none(mm, address, &pmd); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } if (unmapped) { /* * __collapse_huge_page_swapin will return with mmap_lock * released when it fails. So we jump out_nolock directly in * that case. Continuing to collapse causes inconsistency. */ result = __collapse_huge_page_swapin(mm, vma, address, pmd, referenced); if (result != SCAN_SUCCEED) goto out_nolock; } mmap_read_unlock(mm); /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. * * UFFDIO_MOVE is prevented to race as well thanks to the * mmap_lock. */ mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ vma_start_write(vma); result = check_pmd_still_valid(mm, address, pmd); if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * This removes any huge TLB entry from the CPU so we won't allow * huge and small TLB entries for the same virtual address to * avoid the risk of CPU bugs in that area. * * Parallel GUP-fast is fine since GUP-fast will back off when * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); if (pte) { result = __collapse_huge_page_isolate(vma, address, pte, cc, &compound_pagelist); spin_unlock(pte_ptl); } else { result = SCAN_NO_PTE_TABLE; } if (unlikely(result != SCAN_SUCCEED)) { if (pte) pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing * hugepmds and never for establishing regular pmds that * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out_up_write; } /* * All pages are isolated and locked so anon_vma rmap * can't run anymore. */ anon_vma_unlock_write(vma->anon_vma); result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); pgtable_trans_huge_deposit(mm, pmd, pgtable); map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: if (folio) folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } static int hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; struct folio *folio = NULL; unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } continue; } else { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small * PTEs are armed with uffd write protection. * Here we can also mark the new huge pmd as * write protected if any of the small ones is * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. */ result = SCAN_PTE_UFFD_WP; goto out_unmap; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } folio = page_folio(page); if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } /* * We treat a single page as shared if any part of the THP * is shared. */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } } /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; } out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } static void collect_mm_slot(struct mm_slot *slot) { struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. * * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } /* folio must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio, struct page *page) { struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, }; pgd_t *pgdp; p4d_t *p4dp; pud_t *pudp; mmap_assert_locked(vma->vm_mm); if (!pmdp) { pgdp = pgd_offset(mm, addr); p4dp = p4d_alloc(mm, pgdp, addr); if (!p4dp) return SCAN_FAIL; pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return SCAN_FAIL; pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) return SCAN_FAIL; } vmf.pmd = pmdp; if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; folio_get(folio); return SCAN_SUCCEED; } /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. * * @mm: process address space where collapse happens * @addr: THP collapse address * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { int nr_mapped_ptes = 0, result = SCAN_FAIL; unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; int i; mmap_assert_locked(mm); /* First check VMA found, in case page tables are being torn down */ if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here and force collapse. */ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (IS_ERR(folio)) return SCAN_PAGE_NULL; if (folio_order(folio) != HPAGE_PMD_ORDER) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; case SCAN_NO_PTE_TABLE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. */ goto maybe_install_pmd; default: goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); /* empty pte, skip */ if (pte_none(ptent)) continue; /* page swapped out, abort */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. */ if (folio_page(folio, i) != page) goto abort; } pte_unmap_unlock(start_pte, ptl); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; /* * pmd_lock covers a wider range than ptl, and (if split from mm's * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) spin_lock(ptl); else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) goto abort; /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, pte += nr_batch_ptes) { unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); nr_batch_ptes = 1; if (pte_none(ptent)) continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (folio_page(folio, i) != page) goto abort; nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ clear_ptes(mm, addr, pte, nr_batch_ptes); folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_mapped_ptes) { folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); if (ptl != pml) { spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { flush_tlb_mm(mm); goto unlock; } } } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); pte_unmap_unlock(start_pte, ptl); if (ptl != pml) spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, haddr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: if (nr_mapped_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_folio: folio_unlock(folio); folio_put(folio); return result; } /* Can we retract page tables for this file-backed VMA? */ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) { /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth removing * page tables from, as PMD-mapping is likely to be split later. */ if (READ_ONCE(vma->anon_vma)) return false; /* * When a vma is registered with uffd-wp, we cannot recycle * the page table because there may be pte markers installed. * Other vmas can still have the same file mapped hugely, but * skip this one: it will always be mapped in small page size * for uffd-wp registered ranges. */ if (userfaultfd_wp(vma)) return false; /* * If the VMA contains guard regions then we can't collapse it. * * This is set atomically on guard marker installation under mmap/VMA * read lock, and here we may not hold any VMA or mmap lock at all. * * This is therefore serialised on the PTE page table lock, which is * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. */ if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { struct mmu_notifier_range range; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; bool success = false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; if (hpage_collapse_test_exit(mm)) continue; if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); /* * The lock of new_folio is still held, we will be blocked in * the page fault path, which prevents the pte entries from * being set again. So even though the old empty PTE page may be * concurrently freed and a new PTE page is filled into the pmd * entry, it is still empty and can be removed. * * So here we only need to recheck if the state of pmd entry * still meets our requirements, rather than checking pmd_same() * like elsewhere. */ if (check_pmd_state(pmd) != SCAN_SUCCEED) goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* * Huge page lock is still held, so normally the page table must * remain empty; and we have already skipped anon_vma and * userfaultfd_wp() vmas. But since the mmap_lock is not held, * it is still possible for a racing userfaultfd_ioctl() or * madvise() to have inserted ptes or markers. Now that we hold * ptlock, repeating the retractable checks protects us from * races against the prior checks. */ if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); success = true; } if (ptl != pml) spin_unlock(ptl); drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } } i_mmap_unlock_read(mapping); } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache, locking old pages * + swap/gup in pages if necessary; * - copy data to new page * - handle shmem holes * + re-validate that holes weren't filled by someone else * + check for userfaultfd * - finalize updates to the page cache; * - if replacing succeeds: * + unlock huge page; * + free old pages; * - if replacing failed; * + unlock old pages * + unlock and free huge page; */ static int collapse_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *dst; struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); result = alloc_charge_folio(&new_folio, mm, cc); if (result != SCAN_SUCCEED) goto out; mapping_set_update(&xas, mapping); __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); new_folio->index = start; new_folio->mapping = mapping; /* * Ensure we have slots for all the pages in the range. This is * almost certainly a no-op because most of the pages must be present */ do { xas_lock_irq(&xas); xas_create_range(&xas); if (!xas_error(&xas)) break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { result = SCAN_FAIL; goto rollback; } } while (1); for (index = start; index < end;) { xas_set(&xas, index); folio = xas_load(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { if (!folio) { /* * Stop if extent has been truncated or * hole-punched, and is now completely * empty. */ if (index == start) { if (!xas_next_entry(&xas, end - 1)) { result = SCAN_TRUNCATED; goto xa_locked; } } nr_none++; index++; continue; } if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, 0, &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } else { /* !is_shmem */ if (!folio || xa_is_value(folio)) { xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { result = SCAN_FAIL; goto xa_unlocked; } } else if (folio_test_dirty(folio)) { /* * khugepaged only works on read-only fd, * so this page is dirty because it hasn't * been flushed since first write. There * won't be new dirty pages. * * Trigger async flush here and hope the * writeback is done when khugepaged * revisits this page. * * This is a one-off situation. We are not * forcing writeback in loop. */ xas_unlock_irq(&xas); filemap_flush(mapping); result = SCAN_FAIL; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); result = SCAN_FAIL; goto xa_unlocked; } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } /* * The folio must be locked, so we can drop the i_pages lock * without racing with truncate. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); /* make sure the folio is up to date */ if (unlikely(!folio_test_uptodate(folio))) { result = SCAN_FAIL; goto out_unlock; } /* * If file was truncated then extended, or hole-punched, before * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } if (!is_shmem && (folio_test_dirty(folio) || folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * folio is dirty because it hasn't been flushed * since first write. */ result = SCAN_FAIL; goto out_unlock; } if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; } if (folio_mapped(folio)) try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* * We control 2 + nr_pages references to the folio: * - we hold a pin on it; * - nr_pages reference from page cache; * - one from lru_isolate_folio; * If those are the only references, then any new usage * of the folio will have to fetch it from the page * cache. That requires locking the folio to handle * truncate, so any new usage will be blocked until we * unlock folio after collapse/during rollback. */ if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); folio_putback_lru(folio); goto out_unlock; } /* * Accumulate the folios that are being collapsed. */ list_add_tail(&folio->lru, &pagelist); index += folio_nr_pages(folio); continue; out_unlock: folio_unlock(folio); folio_put(folio); goto xa_unlocked; } if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure i_writecount is up to date and the update to nr_thps * is visible. Ensures the page cache will be truncated if the * file is opened writable. */ smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; filemap_nr_thps_dec(mapping); } } xa_locked: xas_unlock_irq(&xas); xa_unlocked: /* * If collapse is successful, flush must be done now before copying. * If collapse is unsuccessful, does flush actually need to be done? * Do it anyway, to clear the state. */ try_to_unmap_flush(); if (result == SCAN_SUCCEED && nr_none && !shmem_charge(mapping->host, nr_none)) result = SCAN_FAIL; if (result != SCAN_SUCCEED) { nr_none = 0; goto rollback; } /* * The old folios are locked, so they won't change anymore. */ index = start; dst = folio_page(new_folio, 0); list_for_each_entry(folio, &pagelist, lru) { int i, nr_pages = folio_nr_pages(folio); while (index < folio->index) { clear_highpage(dst); index++; dst++; } for (i = 0; i < nr_pages; i++) { if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { result = SCAN_COPY_MC; goto rollback; } index++; dst++; } } while (index < end) { clear_highpage(dst); index++; dst++; } if (nr_none) { struct vm_area_struct *vma; int nr_none_check = 0; i_mmap_lock_read(mapping); xas_lock_irq(&xas); xas_set(&xas, start); for (index = start; index < end; index++) { if (!xas_next(&xas)) { xas_store(&xas, XA_RETRY_ENTRY); if (xas_error(&xas)) { result = SCAN_STORE_FAILED; goto immap_locked; } nr_none_check++; } } if (nr_none != nr_none_check) { result = SCAN_PAGE_FILLED; goto immap_locked; } /* * If userspace observed a missing page in a VMA with * a MODE_MISSING userfaultfd, then it might expect a * UFFD_EVENT_PAGEFAULT for that page. If so, we need to * roll back to avoid suppressing such an event. Since * wp/minor userfaultfds don't give userspace any * guarantees that the kernel doesn't fill a missing * page with a zero page, so they don't matter here. * * Any userfaultfds registered after this point will * not be able to observe any missing pages due to the * previously inserted retry entries. */ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { if (userfaultfd_missing(vma)) { result = SCAN_EXCEED_NONE_PTE; goto immap_locked; } } immap_locked: i_mmap_unlock_read(mapping); if (result != SCAN_SUCCEED) { xas_set(&xas, start); for (index = start; index < end; index++) { if (xas_next(&xas) == XA_RETRY_ENTRY) xas_store(&xas, NULL); } xas_unlock_irq(&xas); goto rollback; } } else { xas_lock_irq(&xas); } if (is_shmem) lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* * Mark new_folio as uptodate before inserting it into the * page cache so that it isn't mistaken for an fallocated but * unwritten page. */ folio_mark_uptodate(new_folio); folio_ref_add(new_folio, HPAGE_PMD_NR - 1); if (is_shmem) folio_mark_dirty(new_folio); folio_add_lru(new_folio); /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); xas_store(&xas, new_folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); /* * Remove pte page tables, so we can re-fault the page as huge. * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). */ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; folio_unlock(new_folio); /* * The collapse has succeeded, so free the old folios. */ list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); folio->mapping = NULL; folio_clear_active(folio); folio_clear_unevictable(folio); folio_unlock(folio); folio_put_refs(folio, 2 + folio_nr_pages(folio)); } goto out; rollback: /* Something went wrong: roll back page cache changes */ if (nr_none) { xas_lock_irq(&xas); mapping->nrpages -= nr_none; xas_unlock_irq(&xas); shmem_uncharge(mapping->host, nr_none); } list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); folio_unlock(folio); folio_putback_lru(folio); folio_put(folio); } /* * Undo the updates of filemap_nr_thps_inc for non-SHMEM * file only. This undo is not needed unless failure is * due to SCAN_COPY_MC. */ if (!is_shmem && result == SCAN_COPY_MC) { filemap_nr_thps_dec(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure the update to nr_thps is visible. */ smp_mb(); } new_folio->mapping = NULL; folio_unlock(new_folio); folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); return result; } static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; present = 0; swap = 0; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); rcu_read_lock(); xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) { swap += 1 << xas_get_order(&xas); if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } if (!folio_try_get(folio)) { xas_reset(&xas); continue; } if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); xas_reset(&xas); continue; } if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ result = SCAN_PTE_MAPPED_HUGEPAGE; /* * For SCAN_PTE_MAPPED_HUGEPAGE, further processing * by the caller won't touch the page cache, and so * it's safe to skip LRU and refcount checks before * returning. */ folio_put(folio); break; } node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; folio_put(folio); break; } if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; folio_put(folio); break; } /* * We probably should check if the folio is referenced * here, but nobody would transfer pte_young() to * folio_test_referenced() for us. And rmap walk here * is just too costly... */ present += folio_nr_pages(folio); folio_put(folio); if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { result = collapse_file(mm, addr, file, start, cc); } } trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { slot = khugepaged_scan.mm_slot; } else { slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. */ vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; progress++; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { bool mmap_locked = true; cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); mmap_read_unlock(mm); mmap_locked = false; *result = hpage_collapse_scan_file(mm, khugepaged_scan.address, file, pgoff, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) *result = SCAN_SUCCEED; mmap_read_unlock(mm); } } else { *result = hpage_collapse_scan_pmd(mm, vma, khugepaged_scan.address, &mmap_locked, cc); } if (*result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; if (!mmap_locked) /* * We released mmap_lock so break loop. Note * that we drop mmap_lock before all hugepage * allocations, so if allocation fails, we are * guaranteed to break here and report the * correct result back to caller. */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; } } breakouterloop: mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } collect_mm_slot(slot); } return progress; } static int khugepaged_has_work(void) { return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || kthread_should_stop(); } static void khugepaged_do_scan(struct collapse_control *cc) { unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; int result = SCAN_SUCCEED; lru_add_drain_all(); while (true) { cond_resched(); if (unlikely(kthread_should_stop())) break; spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); if (progress >= pages) break; if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { /* * If fail to allocate the first time, try to sleep for * a while. When hit again, cancel the scan. */ if (!wait) break; wait = false; khugepaged_alloc_sleep(); } } } static bool khugepaged_should_wakeup(void) { return kthread_should_stop() || time_after_eq(jiffies, khugepaged_sleep_expire); } static void khugepaged_wait_work(void) { if (khugepaged_has_work()) { const unsigned long scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs); if (!scan_sleep_jiffies) return; khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, khugepaged_should_wakeup(), scan_sleep_jiffies); return; } if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) { struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; if (slot) collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } static void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; unsigned long recommended_min; if (!hugepage_pmd_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of * ZONE_MOVABLE since it only has movable pages. */ if (zone_idx(zone) > gfp_zone(GFP_USER)) continue; nr_zones++; } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* * Make sure that on average at least two pageblocks are almost free * of another type, one for a migratetype to fall back to and a * second to avoid subsequent fallbacks of other types There are 3 * MIGRATE_TYPES we care about. */ recommended_min += pageblock_nr_pages * nr_zones * MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; /* don't ever allow to reserve more than 5% of the lowmem */ recommended_min = min(recommended_min, (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; } update_wmarks: setup_per_zone_wmarks(); } int start_stop_khugepaged(void) { int err = 0; mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (IS_ERR(khugepaged_thread)) { pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; } void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } bool current_is_khugepaged(void) { return kthread_func(current) == khugepaged; } static int madvise_collapse_errno(enum scan_result r) { /* * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide * actionable feedback to caller, so they may take an appropriate * fallback measure depending on the nature of the failure. */ switch (r) { case SCAN_ALLOC_HUGE_PAGE_FAIL: return -ENOMEM; case SCAN_CGROUP_CHARGE_FAIL: case SCAN_EXCEED_NONE_PTE: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to * specified memory range. khugepaged likely won't be able to collapse * either. */ default: return -EINVAL; } } int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool *lock_dropped) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; unsigned long hstart, hend, addr; int thps = 0, last_fail = SCAN_FAIL; bool mmap_locked = true; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); if (!cc) return -ENOMEM; cc->is_khugepaged = false; mmgrab(mm); lru_add_drain_all(); hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = end & HPAGE_PMD_MASK; for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { int result = SCAN_FAIL; if (!mmap_locked) { cond_resched(); mmap_read_lock(mm); mmap_locked = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; } hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); mmap_read_unlock(mm); mmap_locked = false; result = hpage_collapse_scan_file(mm, addr, file, pgoff, cc); fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc); } if (!mmap_locked) *lock_dropped = true; handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: last_fail = result; /* Other error, exit */ goto out_maybelock; } } out_maybelock: /* Caller expects us to hold mmap_lock on return */ if (!mmap_locked) mmap_read_lock(mm); out_nolock: mmap_assert_locked(mm); mmdrop(mm); kfree(cc); return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 : madvise_collapse_errno(last_fail); }
24 1 1 22 2 18 1 27 2 4 23 26 12 1 3 2 1 5 11 153 153 114 16 57 57 57 19 51 6 6 72 73 73 73 145 145 119 33 145 144 144 127 18 144 143 2 130 16 145 33 31 30 2 31 4 118 140 1 139 126 16 127 15 7 15 120 3 1 3 1 4 2 2 4 4 4 3 3 1 4 1 3 1 1 8 13 18 8 30 10 20 4 4 1 3 62 62 3 1 4 53 53 4 4 46 39 9 8 152 20 10 51 72 2 2 158 31 129 4 5 1 153 8 61 89 153 161 161 124 49 32 32 24 9 12 3 12 51 32 26 2 88 87 1 6 82 82 62 27 60 2 68 68 55 14 20 32 6 19 7 1 6 23 30 14 32 1 1 30 2 2 1 24 6 6 6 6 6 33 1 23 9 6 52 63 121 73 20 4 24 4 23 13 50 23 12 30 20 1 4 16 16 8 3 16 16 20 8 12 19 19 19 19 18 19 19 19 10 3 9 6 11 15 15 1 3 9 4 5 1 3 4 4 22 1 2 19 1 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) /* This mutex is used to * - protect race between prog/link attach/detach and link prog update, and * - protect race between releasing and accessing map in bpf_link. * A single global mutex lock is used since it is expected contention is low. */ static DEFINE_MUTEX(sockmap_mutex); static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_map *map; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { struct bpf_prog *prog; struct bpf_map *map; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); break; } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { struct sk_psock *psock = sk_psock(sk); if (likely(psock)) { sock_map_del_link(sk, psock, link_raw); sk_psock_put(sk, psock); } } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) { if (sk->sk_prot->close != sock_map_close) { psock = ERR_PTR(-EBUSY); goto out; } if (!refcount_inc_not_zero(&psock->refcnt)) psock = ERR_PTR(-EBUSY); } out: rcu_read_unlock(); return psock; } static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); if (IS_ERR(stream_verdict)) return PTR_ERR(stream_verdict); } stream_parser = READ_ONCE(progs->stream_parser); if (stream_parser) { stream_parser = bpf_prog_inc_not_zero(stream_parser); if (IS_ERR(stream_parser)) { ret = PTR_ERR(stream_parser); goto out_put_stream_verdict; } } msg_parser = READ_ONCE(progs->msg_parser); if (msg_parser) { msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); goto out_put_stream_parser; } } skb_verdict = READ_ONCE(progs->skb_verdict); if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) { ret = PTR_ERR(skb_verdict); goto out_put_msg_parser; } } psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); if (stream_parser) psock_set_prog(&psock->progs.stream_parser, stream_parser); if (stream_verdict) psock_set_prog(&psock->progs.stream_verdict, stream_verdict); if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); /* msg_* and stream_* programs references tracked in psock after this * point. Reference dec and cleanup will occur through psock destructor */ ret = sock_map_init_proto(sk, psock); if (ret < 0) { sk_psock_put(sk, psock); goto out; } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { if (sk_is_tcp(sk)) ret = sk_psock_init_strp(sk, psock); else ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); goto out; } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: if (stream_parser) bpf_prog_put(stream_parser); out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); out: return ret; } static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk = NULL; int err = 0; spin_lock_bh(&stab->lock); if (!sk_test || sk_test == *psk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; struct sock **psk; if (unlikely(i >= map->max_entries)) return -EINVAL; psk = &stab->sks[i]; return __sock_map_delete(stab, NULL, psk); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = key ? *(u32 *)key : U32_MAX; u32 *key_next = next; if (i == stab->map.max_entries - 1) return -ENOENT; if (i >= stab->map.max_entries) *key_next = 0; else *key_next = i + 1; return 0; } static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!osk && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); spin_unlock_bh(&stab->lock); return 0; out_unlock: spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; if (sk_is_vsock(sk) && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; if (ufd > S32_MAX) return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) return ret; sk = sock->sk; if (!sk) { ret = -EINVAL; goto out; } if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: sockfd_put(sock); return ret; } static long sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; if (unlikely(!sk || !sk_fullsock(sk))) return -EINVAL; if (!sock_map_sk_is_suitable(sk)) return -EOPNOTSUPP; local_bh_disable(); bh_lock_sock(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); bh_unlock_sock(sk); local_bh_enable(); return ret; } BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_map_update_common(map, *(u32 *)key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_map_update_proto = { .func = bpf_sock_map_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; if (sk_is_vsock(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; struct sock_map_seq_info { struct bpf_map *map; struct sock *sk; u32 index; }; struct bpf_iter__sockmap { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(struct sock *, sk); }; DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, struct sock *sk) static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) { if (unlikely(info->index >= info->map->max_entries)) return NULL; info->sk = __sock_map_lookup_elem(info->map, info->index); /* can't return sk directly, since that might be NULL */ return info; } static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_map_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_map_seq_stop */ rcu_read_lock(); return sock_map_seq_lookup_elem(info); } static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; ++*pos; ++info->index; return sock_map_seq_lookup_elem(info); } static int sock_map_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !v); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; ctx.sk = info->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_map_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_map_seq_show(seq, NULL); /* pairs with sock_map_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_map_seq_ops = { .start = sock_map_seq_start, .next = sock_map_seq_next, .stop = sock_map_seq_stop, .show = sock_map_seq_show, }; static int sock_map_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_map_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } static void sock_map_fini_seq_private(void *priv_data) { struct sock_map_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_stab); usage += (u64)map->max_entries * sizeof(struct sock *); return usage; } static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; struct hlist_node node; u8 key[]; }; struct bpf_shtab_bucket { struct hlist_head head; spinlock_t lock; }; struct bpf_shtab { struct bpf_map map; struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; atomic_t count; }; static inline u32 sock_hash_bucket_hash(const void *key, u32 len) { return jhash(key, len, 0); } static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && !memcmp(&elem->key, key, key_size)) return elem; } return NULL; } static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); return elem ? elem->sk : NULL; } static void sock_hash_free_elem(struct bpf_shtab *htab, struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); } static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem_probe, *elem = link_raw; struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); /* elem may be deleted in parallel from the map, but access here * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); ret = 0; } spin_unlock_bh(&bucket->lock); return ret; } static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, void *key, u32 key_size, u32 hash, struct sock *sk, struct bpf_shtab_elem *old) { struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } } new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); } memcpy(new->key, key, key_size); new->sk = sk; new->hash = hash; return new; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_elem *elem, *elem_new; struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!elem && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); if (IS_ERR(elem_new)) { ret = PTR_ERR(elem_new); goto out_unlock; } sock_map_add_link(psock, link, map, elem_new); /* Add new element to the head of the list, so that * concurrent search will find it before old elem. */ hlist_add_head_rcu(&elem_new->node, &bucket->head); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); return 0; out_unlock: spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; if (!key) goto find_first_elem; hash = sock_hash_bucket_hash(key, key_size); head = &sock_hash_select_bucket(htab, hash)->head; elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); if (!elem) goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } i = hash & (htab->buckets_num - 1); i++; find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } } return -ENOENT; } static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { err = -ENOMEM; goto free_htab; } for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); spin_lock_init(&htab->buckets[i].lock); } return &htab->map; free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static void sock_hash_free(struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; struct bpf_shtab_elem *elem; struct hlist_node *node; int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); /* We are racing with sock_hash_delete_from_link to * enter the spin-lock critical section. Every socket on * the list is still linked to sockhash. Since link * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's * link to sockhash. */ hlist_for_each_entry_safe(elem, node, &unlink_list, node) { hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); sock_put(elem->sk); sock_hash_free_elem(htab, elem); } cond_resched(); } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(htab->buckets); bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_hash_lookup_elem(map, key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static void *sock_hash_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_hash_lookup_elem(map, key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_hash_update_common(map, key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_hash_update_proto = { .func = bpf_sock_hash_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; if (sk_is_vsock(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; struct sock_hash_seq_info { struct bpf_map *map; struct bpf_shtab *htab; u32 bucket_id; }; static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, struct bpf_shtab_elem *prev_elem) { const struct bpf_shtab *htab = info->htab; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; struct hlist_node *node; /* try to find next elem in the same bucket */ if (prev_elem) { node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; /* no more elements, continue in the next bucket */ info->bucket_id++; } for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { bucket = &htab->buckets[info->bucket_id]; node = rcu_dereference(hlist_first_rcu(&bucket->head)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; } return NULL; } static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_hash_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_hash_seq_stop */ rcu_read_lock(); return sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **cur_pprog; struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: cur_pprog = &progs->msg_parser; cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: cur_pprog = &progs->stream_parser; cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; cur_pprog = &progs->stream_verdict; cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; cur_pprog = &progs->skb_verdict; cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } *pprog = cur_pprog; if (plink) *plink = cur_plink; return 0; } /* Handle the following four cases: * prog_attach: prog != NULL, old == NULL, link == NULL * prog_detach: prog == NULL, old != NULL, link == NULL * link_attach: prog != NULL, old == NULL, link != NULL * link_detach: prog == NULL, old != NULL, link != NULL */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which) { struct bpf_prog **pprog; struct bpf_link **plink; int ret; ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; /* for prog_attach/prog_detach/link_attach, return error if a bpf_link * exists for that prog. */ if ((!link || prog) && *plink) return -EBUSY; if (old) { ret = psock_replace_prog(pprog, prog, old); if (!ret) *plink = NULL; } else { psock_set_prog(pprog, prog); if (link) *plink = link; } return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. */ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); psock = sk_psock_get(sk); if (unlikely(!psock)) goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } else { saved_close = READ_ONCE(sk->sk_prot)->close; no_psock: rcu_read_unlock(); release_sock(sk); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); struct sockmap_link { struct bpf_link link; struct bpf_map *map; }; static void sock_map_link_release(struct bpf_link *link) { struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); mutex_lock(&sockmap_mutex); if (!sockmap_link->map) goto out; WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, link->attach_type)); bpf_map_put_with_uref(sockmap_link->map); sockmap_link->map = NULL; out: mutex_unlock(&sockmap_mutex); } static int sock_map_link_detach(struct bpf_link *link) { sock_map_link_release(link); return 0; } static void sock_map_link_dealloc(struct bpf_link *link) { kfree(link); } /* Handle the following two cases: * case 1: link != NULL, prog != NULL, old != NULL * case 2: link != NULL, prog != NULL, old == NULL */ static int sock_map_link_update_prog(struct bpf_link *link, struct bpf_prog *prog, struct bpf_prog *old) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); struct bpf_prog **pprog, *old_link_prog; struct bpf_link **plink; int ret = 0; mutex_lock(&sockmap_mutex); /* If old prog is not NULL, ensure old prog is the same as link->prog. */ if (old && link->prog != old) { ret = -EPERM; goto out; } /* Ensure link->prog has the same type/attach_type as the new prog. */ if (link->prog->type != prog->type || link->prog->expected_attach_type != prog->expected_attach_type) { ret = -EINVAL; goto out; } if (!sockmap_link->map) { ret = -ENOLINK; goto out; } ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, link->attach_type); if (ret) goto out; /* return error if the stored bpf_link does not match the incoming bpf_link. */ if (link != *plink) { ret = -EBUSY; goto out; } if (old) { ret = psock_replace_prog(pprog, prog, old); if (ret) goto out; } else { psock_set_prog(pprog, prog); } bpf_prog_inc(prog); old_link_prog = xchg(&link->prog, prog); bpf_prog_put(old_link_prog); out: mutex_unlock(&sockmap_mutex); return ret; } static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) { u32 map_id = 0; mutex_lock(&sockmap_mutex); if (sockmap_link->map) map_id = sockmap_link->map->id; mutex_unlock(&sockmap_mutex); return map_id; } static int sock_map_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); info->sockmap.map_id = map_id; info->sockmap.attach_type = link->attach_type; return 0; } static void sock_map_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); seq_printf(seq, "map_id:\t%u\n", map_id); seq_printf(seq, "attach_type:\t%u\n", link->attach_type); } static const struct bpf_link_ops sock_map_link_ops = { .release = sock_map_link_release, .dealloc = sock_map_link_dealloc, .detach = sock_map_link_detach, .update_prog = sock_map_link_update_prog, .fill_link_info = sock_map_link_fill_info, .show_fdinfo = sock_map_link_show_fdinfo, }; int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct sockmap_link *sockmap_link; enum bpf_attach_type attach_type; struct bpf_map *map; int ret; if (attr->link_create.flags) return -EINVAL; map = bpf_map_get_with_uref(attr->link_create.target_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { ret = -EINVAL; goto out; } sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); if (!sockmap_link) { ret = -ENOMEM; goto out; } attach_type = attr->link_create.attach_type; bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, attach_type); sockmap_link->map = map; ret = bpf_link_prime(&sockmap_link->link, &link_primer); if (ret) { kfree(sockmap_link); goto out; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); mutex_unlock(&sockmap_mutex); if (ret) { bpf_link_cleanup(&link_primer); goto out; } /* Increase refcnt for the prog since when old prog is replaced with * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. * * Actually, we do not need to increase refcnt for the prog since bpf_link * will hold a reference. But in order to have less complexity w.r.t. * replacing/setting prog, let us increase the refcnt to make things simpler. */ bpf_prog_inc(prog); return bpf_link_settle(&link_primer); out: bpf_map_put_with_uref(map); return ret; } static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init);
49 64 8 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright (C) 2022, 2024 Intel Corporation */ #ifndef IEEE80211_RATE_H #define IEEE80211_RATE_H #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" struct rate_control_ref { const struct rate_control_ops *ops; void *priv; }; void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc); void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_tx_status *st); void rate_control_rate_init(struct link_sta_info *link_sta); void rate_control_rate_init_all_links(struct sta_info *sta); void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct link_sta_info *link_sta, u32 changed); static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, struct sta_info *sta, gfp_t gfp) { spin_lock_init(&sta->rate_ctrl_lock); return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp); } static inline void rate_control_free_sta(struct sta_info *sta) { struct rate_control_ref *ref = sta->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; ref->ops->free_sta(ref->priv, ista, priv_sta); } static inline void rate_control_add_sta_debugfs(struct sta_info *sta) { #ifdef CONFIG_MAC80211_DEBUGFS struct rate_control_ref *ref = sta->rate_ctrl; if (ref && sta->debugfs_dir && ref->ops->add_sta_debugfs) ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv, sta->debugfs_dir); #endif } extern const struct debugfs_short_fops rcname_ops; static inline void rate_control_add_debugfs(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfsdir; if (!local->rate_ctrl) return; if (!local->rate_ctrl->ops->add_debugfs) return; debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir); local->debugfs.rcdir = debugfsdir; debugfs_create_file("name", 0400, debugfsdir, local->rate_ctrl, &rcname_ops); local->rate_ctrl->ops->add_debugfs(&local->hw, local->rate_ctrl->priv, debugfsdir); #endif } void ieee80211_check_rate_mask(struct ieee80211_link_data *link); /* Get a reference to the rate control algorithm. If `name' is NULL, get the * first available algorithm. */ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name); void rate_control_deinitialize(struct ieee80211_local *local); /* Rate control algorithms */ #ifdef CONFIG_MAC80211_RC_MINSTREL int rc80211_minstrel_init(void); void rc80211_minstrel_exit(void); #else static inline int rc80211_minstrel_init(void) { return 0; } static inline void rc80211_minstrel_exit(void) { } #endif #endif /* IEEE80211_RATE_H */
10 10 2 2 10 10 9 2 2 9 6 9 9 2 9 9 10 9 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 // SPDX-License-Identifier: GPL-2.0-only /* * net/psample/psample.c - Netlink channel for packet sampling * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/timekeeping.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/psample.h> #include <linux/spinlock.h> #include <net/ip_tunnels.h> #include <net/dst_metadata.h> #define PSAMPLE_MAX_PACKET_SIZE 0xffff static LIST_HEAD(psample_groups_list); static DEFINE_SPINLOCK(psample_groups_lock); /* multicast groups */ enum psample_nl_multicast_groups { PSAMPLE_NL_MCGRP_CONFIG, PSAMPLE_NL_MCGRP_SAMPLE, }; static const struct genl_multicast_group psample_nl_mcgrps[] = { [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; static struct genl_family psample_nl_family __ro_after_init; static int psample_group_nl_fill(struct sk_buff *msg, struct psample_group *group, enum psample_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int ret; hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); if (ret < 0) goto error; ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount); if (ret < 0) goto error; ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq); if (ret < 0) goto error; genlmsg_end(msg, hdr); return 0; error: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct psample_group *group; int start = cb->args[0]; int idx = 0; int err; spin_lock_bh(&psample_groups_lock); list_for_each_entry(group, &psample_groups_list, list) { if (!net_eq(group->net, sock_net(msg->sk))) continue; if (idx < start) { idx++; continue; } err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) break; idx++; } spin_unlock_bh(&psample_groups_lock); cb->args[0] = idx; return msg->len; } static const struct genl_small_ops psample_nl_ops[] = { { .cmd = PSAMPLE_CMD_GET_GROUP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = psample_nl_cmd_get_group_dumpit, /* can be retrieved by unprivileged users */ } }; static struct genl_family psample_nl_family __ro_after_init = { .name = PSAMPLE_GENL_NAME, .version = PSAMPLE_GENL_VERSION, .maxattr = PSAMPLE_ATTR_MAX, .netnsok = true, .module = THIS_MODULE, .mcgrps = psample_nl_mcgrps, .small_ops = psample_nl_ops, .n_small_ops = ARRAY_SIZE(psample_nl_ops), .resv_start_op = PSAMPLE_CMD_GET_GROUP + 1, .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps), }; static void psample_group_notify(struct psample_group *group, enum psample_command cmd) { struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return; err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI); if (!err) genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0, PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC); else nlmsg_free(msg); } static struct psample_group *psample_group_create(struct net *net, u32 group_num) { struct psample_group *group; group = kzalloc(sizeof(*group), GFP_ATOMIC); if (!group) return NULL; group->net = net; group->group_num = group_num; list_add_tail(&group->list, &psample_groups_list); psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP); return group; } static void psample_group_destroy(struct psample_group *group) { psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP); list_del(&group->list); kfree_rcu(group, rcu); } static struct psample_group * psample_group_lookup(struct net *net, u32 group_num) { struct psample_group *group; list_for_each_entry(group, &psample_groups_list, list) if ((group->group_num == group_num) && (group->net == net)) return group; return NULL; } struct psample_group *psample_group_get(struct net *net, u32 group_num) { struct psample_group *group; spin_lock_bh(&psample_groups_lock); group = psample_group_lookup(net, group_num); if (!group) { group = psample_group_create(net, group_num); if (!group) goto out; } group->refcount++; out: spin_unlock_bh(&psample_groups_lock); return group; } EXPORT_SYMBOL_GPL(psample_group_get); void psample_group_take(struct psample_group *group) { spin_lock_bh(&psample_groups_lock); group->refcount++; spin_unlock_bh(&psample_groups_lock); } EXPORT_SYMBOL_GPL(psample_group_take); void psample_group_put(struct psample_group *group) { spin_lock_bh(&psample_groups_lock); if (--group->refcount == 0) psample_group_destroy(group); spin_unlock_bh(&psample_groups_lock); } EXPORT_SYMBOL_GPL(psample_group_put); #ifdef CONFIG_INET static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { unsigned short tun_proto = ip_tunnel_info_af(tun_info); const void *tun_opts = ip_tunnel_info_opts(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags) && nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, PSAMPLE_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE)) return -EMSGSIZE; switch (tun_proto) { case AF_INET: if (tun_key->u.ipv4.src && nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->u.ipv4.src)) return -EMSGSIZE; if (tun_key->u.ipv4.dst && nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->u.ipv4.dst)) return -EMSGSIZE; break; case AF_INET6: if (!ipv6_addr_any(&tun_key->u.ipv6.src) && nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, &tun_key->u.ipv6.src)) return -EMSGSIZE; if (!ipv6_addr_any(&tun_key->u.ipv6.dst) && nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, &tun_key->u.ipv6.dst)) return -EMSGSIZE; break; } if (tun_key->tos && nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TOS, tun_key->tos)) return -EMSGSIZE; if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; if (tun_key->tp_src && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, tun_key->tp_src)) return -EMSGSIZE; if (tun_key->tp_dst && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; if (tun_opts_len) { if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; } return 0; } static int psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct nlattr *nla; int err; nla = nla_nest_start_noflag(skb, PSAMPLE_ATTR_TUNNEL); if (!nla) return -EMSGSIZE; err = __psample_ip_tun_to_nlattr(skb, tun_info); if (err) { nla_nest_cancel(skb, nla); return err; } nla_nest_end(skb, nla); return 0; } static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) { unsigned short tun_proto = ip_tunnel_info_af(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; int sum = nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */ if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags)) sum += nla_total_size_64bit(sizeof(u64)); if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) sum += nla_total_size(0); switch (tun_proto) { case AF_INET: if (tun_key->u.ipv4.src) sum += nla_total_size(sizeof(u32)); if (tun_key->u.ipv4.dst) sum += nla_total_size(sizeof(u32)); break; case AF_INET6: if (!ipv6_addr_any(&tun_key->u.ipv6.src)) sum += nla_total_size(sizeof(struct in6_addr)); if (!ipv6_addr_any(&tun_key->u.ipv6.dst)) sum += nla_total_size(sizeof(struct in6_addr)); break; } if (tun_key->tos) sum += nla_total_size(sizeof(u8)); sum += nla_total_size(sizeof(u8)); /* TTL */ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_key->tp_src) sum += nla_total_size(sizeof(u16)); if (tun_key->tp_dst) sum += nla_total_size(sizeof(u16)); if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_opts_len) { if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); } return sum; } #endif void psample_sample_packet(struct psample_group *group, const struct sk_buff *skb, u32 sample_rate, const struct psample_metadata *md) { ktime_t tstamp = ktime_get_real(); int out_ifindex = md->out_ifindex; int in_ifindex = md->in_ifindex; u32 trunc_size = md->trunc_size; #ifdef CONFIG_INET struct ip_tunnel_info *tun_info; #endif struct sk_buff *nl_skb; int data_len; int meta_len; void *data; int ret; if (!genl_has_listeners(&psample_nl_family, group->net, PSAMPLE_NL_MCGRP_SAMPLE)) return; meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) + (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + nla_total_size(sizeof(u32)) + /* sample_rate */ nla_total_size(sizeof(u32)) + /* orig_size */ nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)) + /* seq */ nla_total_size_64bit(sizeof(u64)) + /* timestamp */ nla_total_size(sizeof(u16)) + /* protocol */ (md->user_cookie_len ? nla_total_size(md->user_cookie_len) : 0) + /* user cookie */ (md->rate_as_probability ? nla_total_size(0) : 0); /* rate as probability */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); if (tun_info) meta_len += psample_tunnel_meta_len(tun_info); #endif data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN - NLA_ALIGNTO; nl_skb = genlmsg_new(meta_len + nla_total_size(data_len), GFP_ATOMIC); if (unlikely(!nl_skb)) return; data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0, PSAMPLE_CMD_SAMPLE); if (unlikely(!data)) goto error; if (in_ifindex) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex); if (unlikely(ret < 0)) goto error; } if (out_ifindex) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex); if (unlikely(ret < 0)) goto error; } ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++); if (unlikely(ret < 0)) goto error; if (md->out_tc_valid) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc); if (unlikely(ret < 0)) goto error; } if (md->out_tc_occ_valid) { ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC, md->out_tc_occ, PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; } if (md->latency_valid) { ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY, md->latency, PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; } ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP, ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO, be16_to_cpu(skb->protocol)); if (unlikely(ret < 0)) goto error; if (data_len) { int nla_len = nla_total_size(data_len); struct nlattr *nla; nla = skb_put(nl_skb, nla_len); nla->nla_type = PSAMPLE_ATTR_DATA; nla->nla_len = nla_attr_size(data_len); if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) goto error; } #ifdef CONFIG_INET if (tun_info) { ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); if (unlikely(ret < 0)) goto error; } #endif if (md->user_cookie && md->user_cookie_len && nla_put(nl_skb, PSAMPLE_ATTR_USER_COOKIE, md->user_cookie_len, md->user_cookie)) goto error; if (md->rate_as_probability && nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY)) goto error; genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); return; error: pr_err_ratelimited("Could not create psample log message\n"); nlmsg_free(nl_skb); } EXPORT_SYMBOL_GPL(psample_sample_packet); static int __init psample_module_init(void) { return genl_register_family(&psample_nl_family); } static void __exit psample_module_exit(void) { genl_unregister_family(&psample_nl_family); } module_init(psample_module_init); module_exit(psample_module_exit); MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>"); MODULE_DESCRIPTION("netlink channel for packet sampling"); MODULE_LICENSE("GPL v2");
1 21 3 19 20 4 17 17 2 17 3 15 15 15 3 18 15 8 2 2 4 6 14 14 20 21 2 2 21 20 20 21 34 1 10 28 8 8 20 10 23 6 23 4 20 9 6 2 23 1 23 23 20 8 2 2 2 3 2 38 1 35 35 3 20 12 40 8 33 40 36 35 35 20 20 1 1 2 1 1 1 1 35 14 34 3 35 35 1 34 34 8 35 1 21 12 4 7 1 1 13 9 4 11 2 10 3 11 2 11 2 11 2 10 3 11 2 11 2 11 2 11 2 6 6 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <linux/unaligned.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ static const char *const tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED", "FIN_WAIT", "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "SYN_SENT2", }; enum nf_ct_tcp_action { NFCT_TCP_IGNORE, NFCT_TCP_INVALID, NFCT_TCP_ACCEPT, }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, [TCP_CONNTRACK_SYN_RECV] = 60 SECS, [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, [TCP_CONNTRACK_LAST_ACK] = 30 SECS, [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ [TCP_CONNTRACK_RETRANS] = 5 MINS, [TCP_CONNTRACK_UNACK] = 5 MINS, }; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT #define sSR TCP_CONNTRACK_SYN_RECV #define sES TCP_CONNTRACK_ESTABLISHED #define sFW TCP_CONNTRACK_FIN_WAIT #define sCW TCP_CONNTRACK_CLOSE_WAIT #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE #define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE /* What TCP flags are set from RST/SYN/FIN/ACK. */ enum tcp_bit_set { TCP_SYN_SET, TCP_SYNACK_SET, TCP_FIN_SET, TCP_ACK_SET, TCP_RST_SET, TCP_NONE_SET, }; /* * The TCP state transition table needs a few words... * * We are the man in the middle. All the packets go through us * but might get lost in transit to the destination. * It is assumed that the destinations can't receive segments * we haven't seen. * * The checked segment is in window, but our windows are *not* * equivalent with the ones of the sender/receiver. We always * try to guess the state of the current sender. * * The meaning of the states are: * * NONE: initial state * SYN_SENT: SYN-only packet seen * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen * CLOSE: closed connection (RST) * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): * if we regard them as truly invalid packets */ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN * sS2 -> sS2 Late retransmitted SYN * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. * Or we are not in sync and hold a dead connection. * sFW -> sIG * sCW -> sIG * sLA -> sIG * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open * sES -> sIV Invalid SYN/ACK packets sent by the client * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for * the last ACK. * Migth be a retransmitted FIN as well... * sCW -> sLA * sLA -> sLA Retransmitted FIN. Remain in the same state. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 }, /* * sNO -> sIV Never reached. * sSS -> sS2 Simultaneous open * sS2 -> sS2 Retransmitted simultaneous SYN * sSR -> sIV Invalid SYN packets sent by the server * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sSS Reopened connection, but server may have switched role * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. * sS2 -> sSR Simultaneous open * sSR -> sIG Retransmitted SYN/ACK, ignore it. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN * sCW -> sIG * sLA -> sIG * sTW -> sIG * sCL -> sIG */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. * sCW -> sLA * sLA -> sLA Retransmitted FIN. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) return; seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } #endif static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); else if (tcph->fin) return TCP_FIN_SET; else if (tcph->ack) return TCP_ACK_SET; else return TCP_NONE_SET; } /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering in IP Filter' by Guido van Rooij. http://www.sane.nl/events/sane2000/papers.html http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ The boundaries and the conditions are changed according to RFC793: the packet must intersect the window (i.e. segments may be after the right or before the left edge) and thus receivers may ACK segments after the right edge of the window. td_maxend = max(sack + max(win,1)) seen in reply packets td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets td_maxwin += seq + len - sender.td_maxend if seq + len > sender.td_maxend td_end = max(seq + len) seen in sent packets I. Upper bound for valid data: seq <= sender.td_maxend II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin III. Upper bound for valid (s)ack: sack <= receiver.td_end IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW where sack is the highest right edge of sack block found in the packet or ack in the case of packet without SACK option. The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, unsigned int dataoff, const struct tcphdr *tcph) { /* XXX Should I use payload length field in IP/IPv6 header ? * - YK */ return (seq + len - dataoff - tcph->doff*4 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); } /* Fixme: what about big packets? */ #define MAXACKWINCONST 66000 #define MAXACKWINDOW(sender) \ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ : MAXACKWINCONST) /* * Simplified tcp_parse_options routine from tcp_input.c */ static void tcp_options(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff);