| 27 27 27 29 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 6 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3 5 5 5 5 2 2 2 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 26 26 26 26 26 26 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. */ #include <linux/completion.h> #include <linux/dma-mapping.h> #include <linux/device.h> #include <linux/module.h> #include <linux/err.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/workqueue.h> #include <linux/kdev_t.h> #include <linux/etherdevice.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sysfs.h> #include "cm_msgs.h" #include "core_priv.h" #include "cm_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); #define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */ #define CM_DIRECT_RETRY_CTX ((void *) 1UL) #define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */ static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_NO_QP] = "no QP", [IB_CM_REJ_NO_EEC] = "no EEC", [IB_CM_REJ_NO_RESOURCES] = "no resources", [IB_CM_REJ_TIMEOUT] = "timeout", [IB_CM_REJ_UNSUPPORTED] = "unsupported", [IB_CM_REJ_INVALID_COMM_ID] = "invalid comm ID", [IB_CM_REJ_INVALID_COMM_INSTANCE] = "invalid comm instance", [IB_CM_REJ_INVALID_SERVICE_ID] = "invalid service ID", [IB_CM_REJ_INVALID_TRANSPORT_TYPE] = "invalid transport type", [IB_CM_REJ_STALE_CONN] = "stale conn", [IB_CM_REJ_RDC_NOT_EXIST] = "RDC not exist", [IB_CM_REJ_INVALID_GID] = "invalid GID", [IB_CM_REJ_INVALID_LID] = "invalid LID", [IB_CM_REJ_INVALID_SL] = "invalid SL", [IB_CM_REJ_INVALID_TRAFFIC_CLASS] = "invalid traffic class", [IB_CM_REJ_INVALID_HOP_LIMIT] = "invalid hop limit", [IB_CM_REJ_INVALID_PACKET_RATE] = "invalid packet rate", [IB_CM_REJ_INVALID_ALT_GID] = "invalid alt GID", [IB_CM_REJ_INVALID_ALT_LID] = "invalid alt LID", [IB_CM_REJ_INVALID_ALT_SL] = "invalid alt SL", [IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS] = "invalid alt traffic class", [IB_CM_REJ_INVALID_ALT_HOP_LIMIT] = "invalid alt hop limit", [IB_CM_REJ_INVALID_ALT_PACKET_RATE] = "invalid alt packet rate", [IB_CM_REJ_PORT_CM_REDIRECT] = "port CM redirect", [IB_CM_REJ_PORT_REDIRECT] = "port redirect", [IB_CM_REJ_INVALID_MTU] = "invalid MTU", [IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES] = "insufficient resp resources", [IB_CM_REJ_CONSUMER_DEFINED] = "consumer defined", [IB_CM_REJ_INVALID_RNR_RETRY] = "invalid RNR retry", [IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID] = "duplicate local comm ID", [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version", [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label", [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label", [IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED] = "vendor option is not supported", }; const char *__attribute_const__ ibcm_reject_msg(int reason) { size_t index = reason; if (index < ARRAY_SIZE(ibcm_rej_reason_strs) && ibcm_rej_reason_strs[index]) return ibcm_rej_reason_strs[index]; else return "unrecognized reason"; } EXPORT_SYMBOL(ibcm_reject_msg); struct cm_id_private; struct cm_work; static int cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device, void *client_data); static void cm_process_work(struct cm_id_private *cm_id_priv, struct cm_work *work); static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param); static void cm_issue_dreq(struct cm_id_private *cm_id_priv); static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, void *private_data, u8 private_data_len); static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len); static struct ib_client cm_client = { .name = "cm", .add = cm_add_one, .remove = cm_remove_one }; static struct ib_cm { spinlock_t lock; struct list_head device_list; rwlock_t device_lock; struct rb_root listen_service_table; u64 listen_service_id; /* struct rb_root peer_service_table; todo: fix peer to peer */ struct rb_root remote_qp_table; struct rb_root remote_id_table; struct rb_root remote_sidr_table; struct xarray local_id_table; u32 local_id_next; __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; } cm; /* Counter indexes ordered by attribute ID */ enum { CM_REQ_COUNTER, CM_MRA_COUNTER, CM_REJ_COUNTER, CM_REP_COUNTER, CM_RTU_COUNTER, CM_DREQ_COUNTER, CM_DREP_COUNTER, CM_SIDR_REQ_COUNTER, CM_SIDR_REP_COUNTER, CM_LAP_COUNTER, CM_APR_COUNTER, CM_ATTR_COUNT, CM_ATTR_ID_OFFSET = 0x0010, }; enum { CM_XMIT, CM_XMIT_RETRIES, CM_RECV, CM_RECV_DUPLICATES, CM_COUNTER_GROUPS }; struct cm_counter_attribute { struct ib_port_attribute attr; unsigned short group; unsigned short index; }; struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; struct ib_mad_agent *rep_agent; u32 port_num; atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT]; }; struct cm_device { struct kref kref; struct list_head list; rwlock_t mad_agent_lock; struct ib_device *ib_device; u8 ack_delay; int going_down; struct cm_port *port[]; }; struct cm_av { struct cm_port *port; struct rdma_ah_attr ah_attr; u16 dlid_datapath; u16 pkey_index; u8 timeout; }; struct cm_work { struct delayed_work work; struct list_head list; struct cm_port *port; struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */ __be32 local_id; /* Established / timewait */ __be32 remote_id; struct ib_cm_event cm_event; struct sa_path_rec path[]; }; struct cm_timewait_info { struct cm_work work; struct list_head list; struct rb_node remote_qp_node; struct rb_node remote_id_node; __be64 remote_ca_guid; __be32 remote_qpn; u8 inserted_remote_qp; u8 inserted_remote_id; }; struct cm_id_private { struct ib_cm_id id; struct rb_node service_node; struct rb_node sidr_id_node; u32 sidr_slid; spinlock_t lock; /* Do not acquire inside cm.lock */ struct completion comp; refcount_t refcount; /* Number of clients sharing this ib_cm_id. Only valid for listeners. * Protected by the cm.lock spinlock. */ int listen_sharecount; struct rcu_head rcu; struct ib_mad_send_buf *msg; struct cm_timewait_info *timewait_info; /* todo: use alternate port on send failure */ struct cm_av av; struct cm_av alt_av; void *private_data; __be64 tid; __be32 local_qpn; __be32 remote_qpn; enum ib_qp_type qp_type; __be32 sq_psn; __be32 rq_psn; int timeout_ms; enum ib_mtu path_mtu; __be16 pkey; u8 private_data_len; u8 max_cm_retries; u8 responder_resources; u8 initiator_depth; u8 retry_count; u8 rnr_retry_count; u8 target_ack_delay; struct list_head work_list; atomic_t work_count; struct rdma_ucm_ece ece; }; static void cm_dev_release(struct kref *kref) { struct cm_device *cm_dev = container_of(kref, struct cm_device, kref); u32 i; rdma_for_each_port(cm_dev->ib_device, i) kfree(cm_dev->port[i - 1]); kfree(cm_dev); } static void cm_device_put(struct cm_device *cm_dev) { kref_put(&cm_dev->kref, cm_dev_release); } static void cm_work_handler(struct work_struct *work); static inline void cm_deref_id(struct cm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) complete(&cm_id_priv->comp); } static struct ib_mad_send_buf * cm_alloc_msg_agent(struct cm_id_private *cm_id_priv, bool rep_agent) { struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; lockdep_assert_held(&cm_id_priv->lock); if (!cm_id_priv->av.port) return ERR_PTR(-EINVAL); read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); mad_agent = rep_agent ? cm_id_priv->av.port->rep_agent : cm_id_priv->av.port->mad_agent; if (!mad_agent) { m = ERR_PTR(-EINVAL); goto out; } ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0); if (IS_ERR(ah)) { m = ERR_CAST(ah); goto out; } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, cm_id_priv->av.pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { rdma_destroy_ah(ah, 0); goto out; } m->ah = ah; out: read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return m; } static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) { return cm_alloc_msg_agent(cm_id_priv, false); } static void cm_free_msg(struct ib_mad_send_buf *msg) { if (msg->ah) rdma_destroy_ah(msg->ah, 0); ib_free_send_mad(msg); } static struct ib_mad_send_buf * cm_alloc_priv_msg_rep(struct cm_id_private *cm_id_priv, enum ib_cm_state state, bool rep_agent) { struct ib_mad_send_buf *msg; lockdep_assert_held(&cm_id_priv->lock); msg = cm_alloc_msg_agent(cm_id_priv, rep_agent); if (IS_ERR(msg)) return msg; cm_id_priv->msg = msg; refcount_inc(&cm_id_priv->refcount); msg->context[0] = cm_id_priv; msg->context[1] = (void *) (unsigned long) state; msg->retries = cm_id_priv->max_cm_retries; msg->timeout_ms = cm_id_priv->timeout_ms; return msg; } static struct ib_mad_send_buf * cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) { return cm_alloc_priv_msg_rep(cm_id_priv, state, false); } static void cm_free_priv_msg(struct ib_mad_send_buf *msg) { struct cm_id_private *cm_id_priv = msg->context[0]; lockdep_assert_held(&cm_id_priv->lock); if (!WARN_ON(cm_id_priv->msg != msg)) cm_id_priv->msg = NULL; if (msg->ah) rdma_destroy_ah(msg->ah, 0); cm_deref_id(cm_id_priv); ib_free_send_mad(msg); } static struct ib_mad_send_buf * cm_alloc_response_msg_no_ah(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, bool direct_retry) { struct ib_mad_send_buf *m; m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (!IS_ERR(m)) m->context[0] = direct_retry ? CM_DIRECT_RETRY_CTX : NULL; return m; } static int cm_create_response_msg_ah(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, struct ib_mad_send_buf *msg) { struct ib_ah *ah; ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, port->port_num); if (IS_ERR(ah)) return PTR_ERR(ah); msg->ah = ah; return 0; } static int cm_alloc_response_msg(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, bool direct_retry, struct ib_mad_send_buf **msg) { struct ib_mad_send_buf *m; int ret; m = cm_alloc_response_msg_no_ah(port, mad_recv_wc, direct_retry); if (IS_ERR(m)) return PTR_ERR(m); ret = cm_create_response_msg_ah(port, mad_recv_wc, m); if (ret) { ib_free_send_mad(m); return ret; } *msg = m; return 0; } static void *cm_copy_private_data(const void *private_data, u8 private_data_len) { void *data; if (!private_data || !private_data_len) return NULL; data = kmemdup(private_data, private_data_len, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); return data; } static void cm_set_private_data(struct cm_id_private *cm_id_priv, void *private_data, u8 private_data_len) { if (cm_id_priv->private_data && cm_id_priv->private_data_len) kfree(cm_id_priv->private_data); cm_id_priv->private_data = private_data; cm_id_priv->private_data_len = private_data_len; } static void cm_set_av_port(struct cm_av *av, struct cm_port *port) { struct cm_port *old_port = av->port; if (old_port == port) return; av->port = port; if (old_port) cm_device_put(old_port->cm_dev); if (port) kref_get(&port->cm_dev->kref); } static void cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, struct rdma_ah_attr *ah_attr, struct cm_av *av) { cm_set_av_port(av, port); av->pkey_index = wc->pkey_index; rdma_move_ah_attr(&av->ah_attr, ah_attr); } static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, struct ib_grh *grh, struct cm_av *av) { cm_set_av_port(av, port); av->pkey_index = wc->pkey_index; return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, port->port_num, wc, grh, &av->ah_attr); } static struct cm_port * get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) { struct cm_device *cm_dev; struct cm_port *port = NULL; unsigned long flags; if (attr) { read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (cm_dev->ib_device == attr->device) { port = cm_dev->port[attr->port_num - 1]; break; } } read_unlock_irqrestore(&cm.device_lock, flags); } else { /* SGID attribute can be NULL in following * conditions. * (a) Alternative path * (b) IB link layer without GRH * (c) LAP send messages */ read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { attr = rdma_find_gid(cm_dev->ib_device, &path->sgid, sa_conv_pathrec_to_gid_type(path), NULL); if (!IS_ERR(attr)) { port = cm_dev->port[attr->port_num - 1]; break; } } read_unlock_irqrestore(&cm.device_lock, flags); if (port) rdma_put_gid_attr(attr); } return port; } static int cm_init_av_by_path(struct sa_path_rec *path, const struct ib_gid_attr *sgid_attr, struct cm_av *av) { struct rdma_ah_attr new_ah_attr; struct cm_device *cm_dev; struct cm_port *port; int ret; port = get_cm_port_from_path(path, sgid_attr); if (!port) return -EINVAL; cm_dev = port->cm_dev; ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, be16_to_cpu(path->pkey), &av->pkey_index); if (ret) return ret; cm_set_av_port(av, port); /* * av->ah_attr might be initialized based on wc or during * request processing time which might have reference to sgid_attr. * So initialize a new ah_attr on stack. * If initialization fails, old ah_attr is used for sending any * responses. If initialization is successful, than new ah_attr * is used by overwriting the old one. So that right ah_attr * can be used to return an error response. */ ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, &new_ah_attr, sgid_attr); if (ret) return ret; av->timeout = path->packet_life_time + 1; rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; } /* Move av created by cm_init_av_by_path(), so av.dgid is not moved */ static void cm_move_av_from_path(struct cm_av *dest, struct cm_av *src) { cm_set_av_port(dest, src->port); cm_set_av_port(src, NULL); dest->pkey_index = src->pkey_index; rdma_move_ah_attr(&dest->ah_attr, &src->ah_attr); dest->timeout = src->timeout; } static void cm_destroy_av(struct cm_av *av) { rdma_destroy_ah_attr(&av->ah_attr); cm_set_av_port(av, NULL); } static u32 cm_local_id(__be32 local_id) { return (__force u32) (local_id ^ cm.random_id_operand); } static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; rcu_read_lock(); cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id)); if (!cm_id_priv || cm_id_priv->id.remote_id != remote_id || !refcount_inc_not_zero(&cm_id_priv->refcount)) cm_id_priv = NULL; rcu_read_unlock(); return cm_id_priv; } /* * Trivial helpers to strip endian annotation and compare; the * endianness doesn't actually matter since we just need a stable * order for the RB tree. */ static int be32_lt(__be32 a, __be32 b) { return (__force u32) a < (__force u32) b; } static int be32_gt(__be32 a, __be32 b) { return (__force u32) a > (__force u32) b; } static int be64_lt(__be64 a, __be64 b) { return (__force u64) a < (__force u64) b; } static int be64_gt(__be64 a, __be64 b) { return (__force u64) a > (__force u64) b; } /* * Inserts a new cm_id_priv into the listen_service_table. Returns cm_id_priv * if the new ID was inserted, NULL if it could not be inserted due to a * collision, or the existing cm_id_priv ready for shared usage. */ static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv, ib_cm_handler shared_handler) { struct rb_node **link = &cm.listen_service_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; unsigned long flags; spin_lock_irqsave(&cm.lock, flags); while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); if (cm_id_priv->id.device < cur_cm_id_priv->id.device) link = &(*link)->rb_left; else if (cm_id_priv->id.device > cur_cm_id_priv->id.device) link = &(*link)->rb_right; else if (be64_lt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_left; else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; else { /* * Sharing an ib_cm_id with different handlers is not * supported */ if (cur_cm_id_priv->id.cm_handler != shared_handler || cur_cm_id_priv->id.context || WARN_ON(!cur_cm_id_priv->id.cm_handler)) { spin_unlock_irqrestore(&cm.lock, flags); return NULL; } refcount_inc(&cur_cm_id_priv->refcount); cur_cm_id_priv->listen_sharecount++; spin_unlock_irqrestore(&cm.lock, flags); return cur_cm_id_priv; } } cm_id_priv->listen_sharecount++; rb_link_node(&cm_id_priv->service_node, parent, link); rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); spin_unlock_irqrestore(&cm.lock, flags); return cm_id_priv; } static struct cm_id_private *cm_find_listen(struct ib_device *device, __be64 service_id) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, service_node); if (device < cm_id_priv->id.device) node = node->rb_left; else if (device > cm_id_priv->id.device) node = node->rb_right; else if (be64_lt(service_id, cm_id_priv->id.service_id)) node = node->rb_left; else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; else { refcount_inc(&cm_id_priv->refcount); return cm_id_priv; } } return NULL; } static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_id_table.rb_node; struct rb_node *parent = NULL; struct cm_timewait_info *cur_timewait_info; __be64 remote_ca_guid = timewait_info->remote_ca_guid; __be32 remote_id = timewait_info->work.remote_id; while (*link) { parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_id_node); if (be32_lt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_left; else if (be32_gt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_right; else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; } timewait_info->inserted_remote_id = 1; rb_link_node(&timewait_info->remote_id_node, parent, link); rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table); return NULL; } static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid, __be32 remote_id) { struct rb_node *node = cm.remote_id_table.rb_node; struct cm_timewait_info *timewait_info; struct cm_id_private *res = NULL; spin_lock_irq(&cm.lock); while (node) { timewait_info = rb_entry(node, struct cm_timewait_info, remote_id_node); if (be32_lt(remote_id, timewait_info->work.remote_id)) node = node->rb_left; else if (be32_gt(remote_id, timewait_info->work.remote_id)) node = node->rb_right; else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_left; else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_right; else { res = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); break; } } spin_unlock_irq(&cm.lock); return res; } static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_qp_table.rb_node; struct rb_node *parent = NULL; struct cm_timewait_info *cur_timewait_info; __be64 remote_ca_guid = timewait_info->remote_ca_guid; __be32 remote_qpn = timewait_info->remote_qpn; while (*link) { parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_qp_node); if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_left; else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_right; else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; } timewait_info->inserted_remote_qp = 1; rb_link_node(&timewait_info->remote_qp_node, parent, link); rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table); return NULL; } static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.remote_sidr_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; __be32 remote_id = cm_id_priv->id.remote_id; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, sidr_id_node); if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_left; else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_right; else { if (cur_cm_id_priv->sidr_slid < cm_id_priv->sidr_slid) link = &(*link)->rb_left; else if (cur_cm_id_priv->sidr_slid > cm_id_priv->sidr_slid) link = &(*link)->rb_right; else return cur_cm_id_priv; } } rb_link_node(&cm_id_priv->sidr_id_node, parent, link); rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); return NULL; } static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device, ib_cm_handler cm_handler, void *context) { struct cm_id_private *cm_id_priv; u32 id; int ret; cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->id.state = IB_CM_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.remote_cm_qpn = 1; RB_CLEAR_NODE(&cm_id_priv->service_node); RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); atomic_set(&cm_id_priv->work_count, -1); refcount_set(&cm_id_priv->refcount, 1); ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b, &cm.local_id_next, GFP_KERNEL); if (ret < 0) goto error; cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; return cm_id_priv; error: kfree(cm_id_priv); return ERR_PTR(ret); } /* * Make the ID visible to the MAD handlers and other threads that use the * xarray. */ static void cm_finalize_id(struct cm_id_private *cm_id_priv) { xa_store(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id), cm_id_priv, GFP_ATOMIC); } struct ib_cm_id *ib_create_cm_id(struct ib_device *device, ib_cm_handler cm_handler, void *context) { struct cm_id_private *cm_id_priv; cm_id_priv = cm_alloc_id_priv(device, cm_handler, context); if (IS_ERR(cm_id_priv)) return ERR_CAST(cm_id_priv); cm_finalize_id(cm_id_priv); return &cm_id_priv->id; } EXPORT_SYMBOL(ib_create_cm_id); static struct cm_work *cm_dequeue_work(struct cm_id_private *cm_id_priv) { struct cm_work *work; if (list_empty(&cm_id_priv->work_list)) return NULL; work = list_entry(cm_id_priv->work_list.next, struct cm_work, list); list_del(&work->list); return work; } static void cm_free_work(struct cm_work *work) { if (work->mad_recv_wc) ib_free_recv_mad(work->mad_recv_wc); kfree(work); } static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv, struct cm_work *work) __releases(&cm_id_priv->lock) { bool immediate; /* * To deliver the event to the user callback we have the drop the * spinlock, however, we need to ensure that the user callback is single * threaded and receives events in the temporal order. If there are * already events being processed then thread new events onto a list, * the thread currently processing will pick them up. */ immediate = atomic_inc_and_test(&cm_id_priv->work_count); if (!immediate) { list_add_tail(&work->list, &cm_id_priv->work_list); /* * This routine always consumes incoming reference. Once queued * to the work_list then a reference is held by the thread * currently running cm_process_work() and this reference is not * needed. */ cm_deref_id(cm_id_priv); } spin_unlock_irq(&cm_id_priv->lock); if (immediate) cm_process_work(cm_id_priv, work); } static inline int cm_convert_to_ms(int iba_time) { /* approximate conversion to ms from 4.096us x 2^iba_time */ return 1 << max(iba_time - 8, 0); } /* * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time * Because of how ack_timeout is stored, adding one doubles the timeout. * To avoid large timeouts, select the max(ack_delay, life_time + 1), and * increment it (round up) only if the other is within 50%. */ static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) { int ack_timeout = packet_life_time + 1; if (ack_timeout >= ca_ack_delay) ack_timeout += (ca_ack_delay >= (ack_timeout - 1)); else ack_timeout = ca_ack_delay + (ack_timeout >= (ca_ack_delay - 1)); return min(31, ack_timeout); } static void cm_remove_remote(struct cm_id_private *cm_id_priv) { struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info; if (timewait_info->inserted_remote_id) { rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); timewait_info->inserted_remote_id = 0; } if (timewait_info->inserted_remote_qp) { rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table); timewait_info->inserted_remote_qp = 0; } } static struct cm_timewait_info *cm_create_timewait_info(__be32 local_id) { struct cm_timewait_info *timewait_info; timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); if (!timewait_info) return ERR_PTR(-ENOMEM); timewait_info->work.local_id = local_id; INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler); timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT; return timewait_info; } static void cm_enter_timewait(struct cm_id_private *cm_id_priv) { int wait_time; unsigned long flags; struct cm_device *cm_dev; lockdep_assert_held(&cm_id_priv->lock); cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); if (!cm_dev) return; spin_lock_irqsave(&cm.lock, flags); cm_remove_remote(cm_id_priv); list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); spin_unlock_irqrestore(&cm.lock, flags); /* * The cm_id could be destroyed by the user before we exit timewait. * To protect against this, we search for the cm_id after exiting * timewait before notifying the user that we've exited timewait. */ cm_id_priv->id.state = IB_CM_TIMEWAIT; wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); /* Check if the device started its remove_one */ spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, msecs_to_jiffies(wait_time)); spin_unlock_irqrestore(&cm.lock, flags); /* * The timewait_info is converted into a work and gets freed during * cm_free_work() in cm_timewait_handler(). */ BUILD_BUG_ON(offsetof(struct cm_timewait_info, work) != 0); cm_id_priv->timewait_info = NULL; } static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) { unsigned long flags; lockdep_assert_held(&cm_id_priv->lock); cm_id_priv->id.state = IB_CM_IDLE; if (cm_id_priv->timewait_info) { spin_lock_irqsave(&cm.lock, flags); cm_remove_remote(cm_id_priv); spin_unlock_irqrestore(&cm.lock, flags); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; } } static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id, enum ib_cm_state old_state) { struct cm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct cm_id_private, id); pr_err("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); } static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; enum ib_cm_state old_state; struct cm_work *work; int ret; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irq(&cm_id_priv->lock); old_state = cm_id->state; retest: switch (cm_id->state) { case IB_CM_LISTEN: spin_lock(&cm.lock); if (--cm_id_priv->listen_sharecount > 0) { /* The id is still shared. */ WARN_ON(refcount_read(&cm_id_priv->refcount) == 1); spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); cm_deref_id(cm_id_priv); return; } cm_id->state = IB_CM_IDLE; rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); RB_CLEAR_NODE(&cm_id_priv->service_node); spin_unlock(&cm.lock); break; case IB_CM_SIDR_REQ_SENT: cm_id->state = IB_CM_IDLE; ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_SIDR_REQ_RCVD: cm_send_sidr_rep_locked(cm_id_priv, &(struct ib_cm_sidr_rep_param){ .status = IB_SIDR_REJECT }); /* cm_send_sidr_rep_locked will not move to IDLE if it fails */ cm_id->state = IB_CM_IDLE; break; case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: ib_cancel_mad(cm_id_priv->msg); cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT, &cm_id_priv->id.device->node_guid, sizeof(cm_id_priv->id.device->node_guid), NULL, 0); break; case IB_CM_REQ_RCVD: if (err == -ENOMEM) { /* Do not reject to allow future retries. */ cm_reset_to_idle(cm_id_priv); } else { cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); } break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->msg); cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); goto retest; case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); break; case IB_CM_ESTABLISHED: if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) { cm_id->state = IB_CM_IDLE; break; } cm_issue_dreq(cm_id_priv); cm_enter_timewait(cm_id_priv); goto retest; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->msg); cm_enter_timewait(cm_id_priv); goto retest; case IB_CM_DREQ_RCVD: cm_send_drep_locked(cm_id_priv, NULL, 0); WARN_ON(cm_id->state != IB_CM_TIMEWAIT); goto retest; case IB_CM_TIMEWAIT: /* * The cm_acquire_id in cm_timewait_handler will stop working * once we do xa_erase below, so just move to idle here for * consistency. */ cm_id->state = IB_CM_IDLE; break; case IB_CM_IDLE: break; } WARN_ON(cm_id->state != IB_CM_IDLE); spin_lock(&cm.lock); /* Required for cleanup paths related cm_req_handler() */ if (cm_id_priv->timewait_info) { cm_remove_remote(cm_id_priv); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; } WARN_ON(cm_id_priv->listen_sharecount); WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node)); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id)); cm_deref_id(cm_id_priv); do { ret = wait_for_completion_timeout(&cm_id_priv->comp, msecs_to_jiffies( CM_DESTROY_ID_WAIT_TIMEOUT)); if (!ret) /* timeout happened */ cm_destroy_id_wait_timeout(cm_id, old_state); } while (!ret); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); cm_destroy_av(&cm_id_priv->av); cm_destroy_av(&cm_id_priv->alt_av); kfree(cm_id_priv->private_data); kfree_rcu(cm_id_priv, rcu); } void ib_destroy_cm_id(struct ib_cm_id *cm_id) { cm_destroy_id(cm_id, 0); } EXPORT_SYMBOL(ib_destroy_cm_id); static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id) { if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && (service_id != IB_CM_ASSIGN_SERVICE_ID)) return -EINVAL; if (service_id == IB_CM_ASSIGN_SERVICE_ID) cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++); else cm_id_priv->id.service_id = service_id; return 0; } /** * ib_cm_listen - Initiates listening on the specified service ID for * connection and service ID resolution requests. * @cm_id: Connection identifier associated with the listen request. * @service_id: Service identifier matched against incoming connection * and service ID resolution requests. The service ID should be specified * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will * assign a service ID to the caller. */ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id) { struct cm_id_private *cm_id_priv = container_of(cm_id, struct cm_id_private, id); unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->id.state != IB_CM_IDLE) { ret = -EINVAL; goto out; } ret = cm_init_listen(cm_id_priv, service_id); if (ret) goto out; if (!cm_insert_listen(cm_id_priv, NULL)) { ret = -EBUSY; goto out; } cm_id_priv->id.state = IB_CM_LISTEN; ret = 0; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_cm_listen); /** * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on * the given service ID. * * If there's an existing ID listening on that same device and service ID, * return it. * * @device: Device associated with the cm_id. All related communication will * be associated with the specified device. * @cm_handler: Callback invoked to notify the user of CM events. * @service_id: Service identifier matched against incoming connection * and service ID resolution requests. The service ID should be specified * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will * assign a service ID to the caller. * * Callers should call ib_destroy_cm_id when done with the listener ID. */ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, ib_cm_handler cm_handler, __be64 service_id) { struct cm_id_private *listen_id_priv; struct cm_id_private *cm_id_priv; int err = 0; /* Create an ID in advance, since the creation may sleep */ cm_id_priv = cm_alloc_id_priv(device, cm_handler, NULL); if (IS_ERR(cm_id_priv)) return ERR_CAST(cm_id_priv); err = cm_init_listen(cm_id_priv, service_id); if (err) { ib_destroy_cm_id(&cm_id_priv->id); return ERR_PTR(err); } spin_lock_irq(&cm_id_priv->lock); listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler); if (listen_id_priv != cm_id_priv) { spin_unlock_irq(&cm_id_priv->lock); ib_destroy_cm_id(&cm_id_priv->id); if (!listen_id_priv) return ERR_PTR(-EINVAL); return &listen_id_priv->id; } cm_id_priv->id.state = IB_CM_LISTEN; spin_unlock_irq(&cm_id_priv->lock); /* * A listen ID does not need to be in the xarray since it does not * receive mads, is not placed in the remote_id or remote_qpn rbtree, * and does not enter timewait. */ return &cm_id_priv->id; } EXPORT_SYMBOL(ib_cm_insert_listen); static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) { u64 hi_tid = 0, low_tid; lockdep_assert_held(&cm_id_priv->lock); low_tid = (u64)cm_id_priv->id.local_id; if (!cm_id_priv->av.port) return cpu_to_be64(low_tid); read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); if (cm_id_priv->av.port->mad_agent) hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32; read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return cpu_to_be64(hi_tid | low_tid); } static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, __be64 tid) { hdr->base_version = IB_MGMT_BASE_VERSION; hdr->mgmt_class = IB_MGMT_CLASS_CM; hdr->class_version = IB_CM_CLASS_VERSION; hdr->method = IB_MGMT_METHOD_SEND; hdr->attr_id = attr_id; hdr->tid = tid; } static void cm_format_mad_ece_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, __be64 tid, u32 attr_mod) { cm_format_mad_hdr(hdr, attr_id, tid); hdr->attr_mod = cpu_to_be32(attr_mod); } static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) { struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; bool pri_ext = false; __be16 lid; if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) pri_ext = opa_is_extended_lid(pri_path->opa.dlid, pri_path->opa.slid); cm_format_mad_ece_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv), param->ece.attr_mod); IBA_SET(CM_REQ_LOCAL_COMM_ID, req_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REQ_SERVICE_ID, req_msg, be64_to_cpu(param->service_id)); IBA_SET(CM_REQ_LOCAL_CA_GUID, req_msg, be64_to_cpu(cm_id_priv->id.device->node_guid)); IBA_SET(CM_REQ_LOCAL_QPN, req_msg, param->qp_num); IBA_SET(CM_REQ_INITIATOR_DEPTH, req_msg, param->initiator_depth); IBA_SET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg, param->remote_cm_response_timeout); cm_req_set_qp_type(req_msg, param->qp_type); IBA_SET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg, param->flow_control); IBA_SET(CM_REQ_STARTING_PSN, req_msg, param->starting_psn); IBA_SET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg, param->local_cm_response_timeout); IBA_SET(CM_REQ_PARTITION_KEY, req_msg, be16_to_cpu(param->primary_path->pkey)); IBA_SET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg, param->primary_path->mtu); IBA_SET(CM_REQ_MAX_CM_RETRIES, req_msg, param->max_cm_retries); if (param->qp_type != IB_QPT_XRC_INI) { IBA_SET(CM_REQ_RESPONDER_RESOURCES, req_msg, param->responder_resources); IBA_SET(CM_REQ_RETRY_COUNT, req_msg, param->retry_count); IBA_SET(CM_REQ_RNR_RETRY_COUNT, req_msg, param->rnr_retry_count); IBA_SET(CM_REQ_SRQ, req_msg, param->srq); } *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) = pri_path->sgid; *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) = pri_path->dgid; if (pri_ext) { IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) ->global.interface_id = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) ->global.interface_id = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); } if (pri_path->hop_limit <= 1) { IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, be16_to_cpu(pri_ext ? 0 : htons(ntohl(sa_path_get_slid( pri_path))))); IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, be16_to_cpu(pri_ext ? 0 : htons(ntohl(sa_path_get_dlid( pri_path))))); } else { if (param->primary_path_inbound) { lid = param->primary_path_inbound->ib.dlid; IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, be16_to_cpu(lid)); } else IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, be16_to_cpu(IB_LID_PERMISSIVE)); /* Work-around until there's a way to obtain remote LID info */ IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, be16_to_cpu(IB_LID_PERMISSIVE)); } IBA_SET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg, be32_to_cpu(pri_path->flow_label)); IBA_SET(CM_REQ_PRIMARY_PACKET_RATE, req_msg, pri_path->rate); IBA_SET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg, pri_path->traffic_class); IBA_SET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg, pri_path->hop_limit); IBA_SET(CM_REQ_PRIMARY_SL, req_msg, pri_path->sl); IBA_SET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg, (pri_path->hop_limit <= 1)); IBA_SET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, pri_path->packet_life_time)); if (alt_path) { bool alt_ext = false; if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) alt_ext = opa_is_extended_lid(alt_path->opa.dlid, alt_path->opa.slid); *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg) = alt_path->sgid; *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg) = alt_path->dgid; if (alt_ext) { IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg) ->global.interface_id = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg) ->global.interface_id = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); } if (alt_path->hop_limit <= 1) { IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, be16_to_cpu( alt_ext ? 0 : htons(ntohl(sa_path_get_slid( alt_path))))); IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, be16_to_cpu( alt_ext ? 0 : htons(ntohl(sa_path_get_dlid( alt_path))))); } else { IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, be16_to_cpu(IB_LID_PERMISSIVE)); IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, be16_to_cpu(IB_LID_PERMISSIVE)); } IBA_SET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg, be32_to_cpu(alt_path->flow_label)); IBA_SET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg, alt_path->rate); IBA_SET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg, alt_path->traffic_class); IBA_SET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg, alt_path->hop_limit); IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, alt_path->sl); IBA_SET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg, (alt_path->hop_limit <= 1)); IBA_SET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alt_path->packet_life_time)); } IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id); if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_REQ_PRIVATE_DATA, req_msg, param->private_data, param->private_data_len); } static int cm_validate_req_param(struct ib_cm_req_param *param) { if (!param->primary_path) return -EINVAL; if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && param->qp_type != IB_QPT_XRC_INI) return -EINVAL; if (param->private_data && param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE) return -EINVAL; if (param->alternate_path && (param->alternate_path->pkey != param->primary_path->pkey || param->alternate_path->mtu != param->primary_path->mtu)) return -EINVAL; return 0; } int ib_send_cm_req(struct ib_cm_id *cm_id, struct ib_cm_req_param *param) { struct cm_av av = {}, alt_av = {}; struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; struct cm_req_msg *req_msg; unsigned long flags; int ret; ret = cm_validate_req_param(param); if (ret) return ret; /* Verify that we're not in timewait. */ cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); return -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; return ret; } ret = cm_init_av_by_path(param->primary_path, param->ppath_sgid_attr, &av); if (ret) return ret; if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av); if (ret) { cm_destroy_av(&av); return ret; } } cm_id->service_id = param->service_id; cm_id_priv->timeout_ms = cm_convert_to_ms( param->primary_path->packet_life_time) * 2 + cm_convert_to_ms( param->remote_cm_response_timeout); cm_id_priv->max_cm_retries = param->max_cm_retries; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->retry_count = param->retry_count; cm_id_priv->path_mtu = param->primary_path->mtu; cm_id_priv->pkey = param->primary_path->pkey; cm_id_priv->qp_type = param->qp_type; spin_lock_irqsave(&cm_id_priv->lock, flags); cm_move_av_from_path(&cm_id_priv->av, &av); if (param->primary_path_outbound) cm_id_priv->av.dlid_datapath = be16_to_cpu(param->primary_path_outbound->ib.dlid); if (param->alternate_path) cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REQ_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out_unlock; } req_msg = (struct cm_req_msg *)msg->mad; cm_format_req(req_msg, cm_id_priv, param); cm_id_priv->tid = req_msg->hdr.tid; cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); trace_icm_send_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto out_free; BUG_ON(cm_id->state != IB_CM_IDLE); cm_id->state = IB_CM_REQ_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; out_free: cm_free_priv_msg(msg); out_unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_req); static int cm_issue_rej(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, enum ib_cm_rej_reason reason, enum cm_msg_response msg_rejected, void *ari, u8 ari_length) { struct ib_mad_send_buf *msg = NULL; struct cm_rej_msg *rej_msg, *rcv_msg; int ret; ret = cm_alloc_response_msg(port, mad_recv_wc, false, &msg); if (ret) return ret; /* We just need common CM header information. Cast to any message. */ rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad; rej_msg = (struct cm_rej_msg *) msg->mad; cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid); IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg)); IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, msg_rejected); IBA_SET(CM_REJ_REASON, rej_msg, reason); if (ari && ari_length) { IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length); IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); } trace_icm_issue_rej( IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg), IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); return ret; } static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) { return ((cpu_to_be16( IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg))) || (ib_is_opa_gid(IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg)))); } static void cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num, struct sa_path_rec *path, union ib_gid *gid) { if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) path->rec_type = SA_PATH_REC_TYPE_OPA; else path->rec_type = SA_PATH_REC_TYPE_IB; } static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, struct sa_path_rec *alt_path, struct ib_wc *wc) { u32 lid; if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { sa_path_set_dlid(primary_path, wc->slid); sa_path_set_slid(primary_path, IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg)); } else { lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg)); sa_path_set_dlid(primary_path, lid); lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg)); sa_path_set_slid(primary_path, lid); } if (!cm_req_has_alt_path(req_msg)) return; if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { sa_path_set_dlid(alt_path, IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg)); sa_path_set_slid(alt_path, IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg)); } else { lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg)); sa_path_set_dlid(alt_path, lid); lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg)); sa_path_set_slid(alt_path, lid); } } static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, struct sa_path_rec *alt_path, struct ib_wc *wc) { primary_path->dgid = *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg); primary_path->sgid = *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg); primary_path->flow_label = cpu_to_be32(IBA_GET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg)); primary_path->hop_limit = IBA_GET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg); primary_path->traffic_class = IBA_GET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg); primary_path->reversible = 1; primary_path->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); primary_path->sl = IBA_GET(CM_REQ_PRIMARY_SL, req_msg); primary_path->mtu_selector = IB_SA_EQ; primary_path->mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); primary_path->rate_selector = IB_SA_EQ; primary_path->rate = IBA_GET(CM_REQ_PRIMARY_PACKET_RATE, req_msg); primary_path->packet_life_time_selector = IB_SA_EQ; primary_path->packet_life_time = IBA_GET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg); primary_path->packet_life_time -= (primary_path->packet_life_time > 0); primary_path->service_id = cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); if (sa_path_is_roce(primary_path)) primary_path->roce.route_resolved = false; if (cm_req_has_alt_path(req_msg)) { alt_path->dgid = *IBA_GET_MEM_PTR( CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg); alt_path->sgid = *IBA_GET_MEM_PTR( CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg); alt_path->flow_label = cpu_to_be32( IBA_GET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg)); alt_path->hop_limit = IBA_GET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg); alt_path->traffic_class = IBA_GET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg); alt_path->reversible = 1; alt_path->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); alt_path->sl = IBA_GET(CM_REQ_ALTERNATE_SL, req_msg); alt_path->mtu_selector = IB_SA_EQ; alt_path->mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); alt_path->rate_selector = IB_SA_EQ; alt_path->rate = IBA_GET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg); alt_path->packet_life_time_selector = IB_SA_EQ; alt_path->packet_life_time = IBA_GET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg); alt_path->packet_life_time -= (alt_path->packet_life_time > 0); alt_path->service_id = cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); if (sa_path_is_roce(alt_path)) alt_path->roce.route_resolved = false; } cm_format_path_lid_from_req(req_msg, primary_path, alt_path, wc); } static u16 cm_get_bth_pkey(struct cm_work *work) { struct ib_device *ib_dev = work->port->cm_dev->ib_device; u32 port_num = work->port->port_num; u16 pkey_index = work->mad_recv_wc->wc->pkey_index; u16 pkey; int ret; ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); if (ret) { dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %u, pkey index %u). %d\n", port_num, pkey_index, ret); return 0; } return pkey; } /** * cm_opa_to_ib_sgid - Convert OPA SGID to IB SGID * ULPs (such as IPoIB) do not understand OPA GIDs and will * reject them as the local_gid will not match the sgid. Therefore, * change the pathrec's SGID to an IB SGID. * * @work: Work completion * @path: Path record */ static void cm_opa_to_ib_sgid(struct cm_work *work, struct sa_path_rec *path) { struct ib_device *dev = work->port->cm_dev->ib_device; u32 port_num = work->port->port_num; if (rdma_cap_opa_ah(dev, port_num) && (ib_is_opa_gid(&path->sgid))) { union ib_gid sgid; if (rdma_query_gid(dev, port_num, 0, &sgid)) { dev_warn(&dev->dev, "Error updating sgid in CM request\n"); return; } path->sgid = sgid; } } static void cm_format_req_event(struct cm_work *work, struct cm_id_private *cm_id_priv, struct ib_cm_id *listen_id) { struct cm_req_msg *req_msg; struct ib_cm_req_event_param *param; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.req_rcvd; param->listen_id = listen_id; param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; cm_opa_to_ib_sgid(work, param->primary_path); if (cm_req_has_alt_path(req_msg)) { param->alternate_path = &work->path[1]; cm_opa_to_ib_sgid(work, param->alternate_path); } else { param->alternate_path = NULL; } param->remote_ca_guid = cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg)); param->remote_qkey = IBA_GET(CM_REQ_LOCAL_Q_KEY, req_msg); param->remote_qpn = IBA_GET(CM_REQ_LOCAL_QPN, req_msg); param->qp_type = cm_req_get_qp_type(req_msg); param->starting_psn = IBA_GET(CM_REQ_STARTING_PSN, req_msg); param->responder_resources = IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); param->initiator_depth = IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); param->local_cm_response_timeout = IBA_GET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg); param->flow_control = IBA_GET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg); param->remote_cm_response_timeout = IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg); param->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); param->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); param->srq = IBA_GET(CM_REQ_SRQ, req_msg); param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; param->ece.vendor_id = IBA_GET(CM_REQ_VENDOR_ID, req_msg); param->ece.attr_mod = be32_to_cpu(req_msg->hdr.attr_mod); work->cm_event.private_data = IBA_GET_MEM_PTR(CM_REQ_PRIVATE_DATA, req_msg); } static void cm_process_work(struct cm_id_private *cm_id_priv, struct cm_work *work) { int ret; /* We will typically only have the current event to report. */ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) { spin_lock_irq(&cm_id_priv->lock); work = cm_dequeue_work(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); if (!work) return; ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); } cm_deref_id(cm_id_priv); if (ret) cm_destroy_id(&cm_id_priv->id, ret); } static void cm_format_mra(struct cm_mra_msg *mra_msg, struct cm_id_private *cm_id_priv, enum cm_msg_response msg_mraed, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); IBA_SET(CM_MRA_MESSAGE_MRAED, mra_msg, msg_mraed); IBA_SET(CM_MRA_LOCAL_COMM_ID, mra_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg, be32_to_cpu(cm_id_priv->id.remote_id)); IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING); if (private_data && private_data_len) IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data, private_data_len); } static void cm_format_rej(struct cm_rej_msg *rej_msg, struct cm_id_private *cm_id_priv, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len, enum ib_cm_state state) { lockdep_assert_held(&cm_id_priv->lock); cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, be32_to_cpu(cm_id_priv->id.remote_id)); switch (state) { case IB_CM_REQ_RCVD: IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(0)); IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_MRA_REQ_SENT: IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REP); break; default: IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_OTHER); break; } IBA_SET(CM_REJ_REASON, rej_msg, reason); if (ari && ari_length) { IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length); IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); } if (private_data && private_data_len) IBA_SET_MEM(CM_REJ_PRIVATE_DATA, rej_msg, private_data, private_data_len); } static void cm_dup_req_handler(struct cm_work *work, struct cm_id_private *cm_id_priv) { struct ib_mad_send_buf *msg = NULL; int ret; atomic_long_inc( &work->port->counters[CM_RECV_DUPLICATES][CM_REQ_COUNTER]); /* Quick state check to discard duplicate REQs. */ spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state == IB_CM_REQ_RCVD) { spin_unlock_irq(&cm_id_priv->lock); return; } spin_unlock_irq(&cm_id_priv->lock); ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) return; spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_MRA_REQ_SENT: cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_REQ, cm_id_priv->private_data, cm_id_priv->private_data_len); break; case IB_CM_TIMEWAIT: cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0, IB_CM_TIMEWAIT); break; default: goto unlock; } spin_unlock_irq(&cm_id_priv->lock); trace_icm_send_dup_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; return; unlock: spin_unlock_irq(&cm_id_priv->lock); free: cm_free_msg(msg); } static struct cm_id_private *cm_match_req(struct cm_work *work, struct cm_id_private *cm_id_priv) { struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; struct cm_timewait_info *timewait_info; struct cm_req_msg *req_msg; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; /* Check for possible duplicate REQ. */ spin_lock_irq(&cm.lock); timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); if (timewait_info) { cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock_irq(&cm.lock); if (cur_cm_id_priv) { cm_dup_req_handler(work, cur_cm_id_priv); cm_deref_id(cur_cm_id_priv); } return NULL; } /* Check for stale connections. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { cm_remove_remote(cm_id_priv); cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, NULL, 0); if (cur_cm_id_priv) { ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } return NULL; } /* Find matching listen request. */ listen_cm_id_priv = cm_find_listen( cm_id_priv->id.device, cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg))); if (!listen_cm_id_priv) { cm_remove_remote(cm_id_priv); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, NULL, 0); return NULL; } spin_unlock_irq(&cm.lock); return listen_cm_id_priv; } /* * Work-around for inter-subnet connections. If the LIDs are permissive, * we need to override the LID/SL data in the REQ with the LID information * in the work completion. */ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!IBA_GET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg)) { if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg)) == IB_LID_PERMISSIVE) { IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, be16_to_cpu(ib_lid_be16(wc->slid))); IBA_SET(CM_REQ_PRIMARY_SL, req_msg, wc->sl); } if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg)) == IB_LID_PERMISSIVE) IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, wc->dlid_path_bits); } if (!IBA_GET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg)) { if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg)) == IB_LID_PERMISSIVE) { IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, be16_to_cpu(ib_lid_be16(wc->slid))); IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, wc->sl); } if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg)) == IB_LID_PERMISSIVE) IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, wc->dlid_path_bits); } } static int cm_req_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; const struct ib_global_route *grh; const struct ib_gid_attr *gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id_priv)) return PTR_ERR(cm_id_priv); cm_id_priv->id.remote_id = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg)); cm_id_priv->id.service_id = cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); cm_id_priv->tid = req_msg->hdr.tid; cm_id_priv->timeout_ms = cm_convert_to_ms( IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg)); cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg); cm_id_priv->remote_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); cm_id_priv->initiator_depth = IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); cm_id_priv->responder_resources = IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); if (ret) goto destroy; cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; goto destroy; } cm_id_priv->timewait_info->work.remote_id = cm_id_priv->id.remote_id; cm_id_priv->timewait_info->remote_ca_guid = cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg)); cm_id_priv->timewait_info->remote_qpn = cm_id_priv->remote_qpn; /* * Note that the ID pointer is not in the xarray at this point, * so this set is only visible to the local thread. */ cm_id_priv->id.state = IB_CM_REQ_RCVD; listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { trace_icm_no_listener_err(&cm_id_priv->id); cm_id_priv->id.state = IB_CM_IDLE; ret = -EINVAL; goto destroy; } memset(&work->path[0], 0, sizeof(work->path[0])); if (cm_req_has_alt_path(req_msg)) memset(&work->path[1], 0, sizeof(work->path[1])); grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); gid_attr = grh->sgid_attr; if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) { work->path[0].rec_type = sa_conv_gid_to_pathrec_type(gid_attr->gid_type); } else { cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_path_set_rec_type( work->port->cm_dev->ib_device, work->port->port_num, &work->path[0], IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg)); } if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1], work->mad_recv_wc->wc); if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) sa_path_set_dmac(&work->path[0], cm_id_priv->av.ah_attr.roce.dmac); work->path[0].hop_limit = grh->hop_limit; /* This destroy call is needed to pair with cm_init_av_for_response */ cm_destroy_av(&cm_id_priv->av); ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av); if (ret) { int err; err = rdma_query_gid(work->port->cm_dev->ib_device, work->port->port_num, 0, &work->path[0].sgid); if (err) ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, NULL, 0, NULL, 0); else ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof(work->path[0].sgid), NULL, 0); goto rejected; } if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_IB) cm_id_priv->av.dlid_datapath = IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg); if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], NULL, &cm_id_priv->alt_av); if (ret) { ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, sizeof(work->path[0].sgid), NULL, 0); goto rejected; } } cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; cm_id_priv->id.context = listen_cm_id_priv->id.context; cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); /* Now MAD handlers can see the new ID */ spin_lock_irq(&cm_id_priv->lock); cm_finalize_id(cm_id_priv); /* Refcount belongs to the event, pairs with cm_process_work() */ refcount_inc(&cm_id_priv->refcount); cm_queue_work_unlock(cm_id_priv, work); /* * Since this ID was just created and was not made visible to other MAD * handlers until the cm_finalize_id() above we know that the * cm_process_work() will deliver the event and the listen_cm_id * embedded in the event can be derefed here. */ cm_deref_id(listen_cm_id_priv); return 0; rejected: cm_deref_id(listen_cm_id_priv); destroy: ib_destroy_cm_id(&cm_id_priv->id); return ret; } static void cm_format_rep(struct cm_rep_msg *rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_rep_param *param) { cm_format_mad_ece_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid, param->ece.attr_mod); IBA_SET(CM_REP_LOCAL_COMM_ID, rep_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REP_REMOTE_COMM_ID, rep_msg, be32_to_cpu(cm_id_priv->id.remote_id)); IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn); IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg, param->responder_resources); IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, cm_id_priv->av.port->cm_dev->ack_delay); IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted); IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count); IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg, be64_to_cpu(cm_id_priv->id.device->node_guid)); if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { IBA_SET(CM_REP_INITIATOR_DEPTH, rep_msg, param->initiator_depth); IBA_SET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg, param->flow_control); IBA_SET(CM_REP_SRQ, rep_msg, param->srq); IBA_SET(CM_REP_LOCAL_QPN, rep_msg, param->qp_num); } else { IBA_SET(CM_REP_SRQ, rep_msg, 1); IBA_SET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, rep_msg, param->qp_num); } IBA_SET(CM_REP_VENDOR_ID_L, rep_msg, param->ece.vendor_id); IBA_SET(CM_REP_VENDOR_ID_M, rep_msg, param->ece.vendor_id >> 8); IBA_SET(CM_REP_VENDOR_ID_H, rep_msg, param->ece.vendor_id >> 16); if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_REP_PRIVATE_DATA, rep_msg, param->private_data, param->private_data_len); } int ib_send_cm_rep(struct ib_cm_id *cm_id, struct ib_cm_rep_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; struct cm_rep_msg *rep_msg; unsigned long flags; int ret; if (param->private_data && param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); ret = -EINVAL; goto out; } msg = cm_alloc_priv_msg_rep(cm_id_priv, IB_CM_REP_SENT, true); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out; } rep_msg = (struct cm_rep_msg *) msg->mad; cm_format_rep(rep_msg, cm_id_priv, param); trace_icm_send_rep(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) goto out_free; cm_id->state = IB_CM_REP_SENT; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg)); WARN_ONCE(param->qp_num & 0xFF000000, "IBTA declares QPN to be 24 bits, but it is 0x%X\n", param->qp_num); cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; out_free: cm_free_priv_msg(msg); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rep); static void cm_format_rtu(struct cm_rtu_msg *rtu_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid); IBA_SET(CM_RTU_LOCAL_COMM_ID, rtu_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_RTU_REMOTE_COMM_ID, rtu_msg, be32_to_cpu(cm_id_priv->id.remote_id)); if (private_data && private_data_len) IBA_SET_MEM(CM_RTU_PRIVATE_DATA, rtu_msg, private_data, private_data_len); } int ib_send_cm_rtu(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; void *data; int ret; if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE) return -EINVAL; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { trace_icm_send_cm_rtu_err(cm_id); ret = -EINVAL; goto error; } msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto error; } cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, private_data, private_data_len); trace_icm_send_rtu(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); kfree(data); return ret; } cm_id->state = IB_CM_ESTABLISHED; cm_set_private_data(cm_id_priv, data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); return ret; } EXPORT_SYMBOL(ib_send_cm_rtu); static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) { struct cm_rep_msg *rep_msg; struct ib_cm_rep_event_param *param; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rep_rcvd; param->remote_ca_guid = cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg)); param->remote_qkey = IBA_GET(CM_REP_LOCAL_Q_KEY, rep_msg); param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); param->starting_psn = IBA_GET(CM_REP_STARTING_PSN, rep_msg); param->responder_resources = IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg); param->initiator_depth = IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg); param->target_ack_delay = IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg); param->failover_accepted = IBA_GET(CM_REP_FAILOVER_ACCEPTED, rep_msg); param->flow_control = IBA_GET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg); param->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); param->srq = IBA_GET(CM_REP_SRQ, rep_msg); param->ece.vendor_id = IBA_GET(CM_REP_VENDOR_ID_H, rep_msg) << 16; param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_M, rep_msg) << 8; param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_L, rep_msg); param->ece.attr_mod = be32_to_cpu(rep_msg->hdr.attr_mod); work->cm_event.private_data = IBA_GET_MEM_PTR(CM_REP_PRIVATE_DATA, rep_msg); } static void cm_dup_rep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rep_msg *rep_msg; struct ib_mad_send_buf *msg = NULL; int ret; rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg))); if (!cm_id_priv) return; atomic_long_inc( &work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]); ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) goto deref; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state == IB_CM_ESTABLISHED) cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, cm_id_priv->private_data, cm_id_priv->private_data_len); else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_REP, cm_id_priv->private_data, cm_id_priv->private_data_len); else goto unlock; spin_unlock_irq(&cm_id_priv->lock); trace_icm_send_dup_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; goto deref; unlock: spin_unlock_irq(&cm_id_priv->lock); free: cm_free_msg(msg); deref: cm_deref_id(cm_id_priv); } static int cm_rep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rep_msg *rep_msg; int ret; struct cm_id_private *cur_cm_id_priv; struct cm_timewait_info *timewait_info; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); if (!cm_id_priv) { cm_dup_rep_handler(work); trace_icm_remote_no_priv_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); return -EINVAL; } cm_format_rep_event(work, cm_id_priv->qp_type); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: break; default: ret = -EINVAL; trace_icm_rep_unknown_err( IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), cm_id_priv->id.state); spin_unlock_irq(&cm_id_priv->lock); goto error; } cm_id_priv->timewait_info->work.remote_id = cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)); cm_id_priv->timewait_info->remote_ca_guid = cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg)); cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); spin_lock(&cm.lock); /* Check for duplicate REP. */ if (cm_insert_remote_id(cm_id_priv->timewait_info)) { spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; trace_icm_insert_failed_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); goto error; } /* Check for a stale connection. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { cm_remove_remote(cm_id_priv); cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; trace_icm_staleconn_err( IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); if (cur_cm_id_priv) { ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } goto error; } spin_unlock(&cm.lock); cm_id_priv->id.state = IB_CM_REP_RCVD; cm_id_priv->id.remote_id = cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)); cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); cm_id_priv->initiator_depth = IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg); cm_id_priv->responder_resources = IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg); cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg)); cm_id_priv->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); cm_id_priv->target_ack_delay = IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg); cm_id_priv->av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->av.timeout - 1); cm_id_priv->alt_av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->alt_av.timeout - 1); ib_cancel_mad(cm_id_priv->msg); cm_queue_work_unlock(cm_id_priv, work); return 0; error: cm_deref_id(cm_id_priv); return ret; } static int cm_establish_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; /* See comment in cm_establish about lookup. */ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); if (!cm_id_priv) return -EINVAL; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { spin_unlock_irq(&cm_id_priv->lock); goto out; } ib_cancel_mad(cm_id_priv->msg); cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_rtu_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rtu_msg *rtu_msg; rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_RTU_REMOTE_COMM_ID, rtu_msg)), cpu_to_be32(IBA_GET(CM_RTU_LOCAL_COMM_ID, rtu_msg))); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = IBA_GET_MEM_PTR(CM_RTU_PRIVATE_DATA, rtu_msg); spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_REP_SENT && cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { spin_unlock_irq(&cm_id_priv->lock); atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_RTU_COUNTER]); goto out; } cm_id_priv->id.state = IB_CM_ESTABLISHED; ib_cancel_mad(cm_id_priv->msg); cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, cm_form_tid(cm_id_priv)); IBA_SET(CM_DREQ_LOCAL_COMM_ID, dreq_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_DREQ_REMOTE_COMM_ID, dreq_msg, be32_to_cpu(cm_id_priv->id.remote_id)); IBA_SET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg, be32_to_cpu(cm_id_priv->remote_qpn)); if (private_data && private_data_len) IBA_SET_MEM(CM_DREQ_PRIVATE_DATA, dreq_msg, private_data, private_data_len); } static void cm_issue_dreq(struct cm_id_private *cm_id_priv) { struct ib_mad_send_buf *msg; int ret; lockdep_assert_held(&cm_id_priv->lock); msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) return; cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, NULL, 0); trace_icm_send_dreq(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); } int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv = container_of(cm_id, struct cm_id_private, id); struct ib_mad_send_buf *msg; unsigned long flags; int ret; if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) return -EINVAL; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { trace_icm_dreq_skipped(&cm_id_priv->id); ret = -EINVAL; goto unlock; } if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->msg); msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_DREQ_SENT); if (IS_ERR(msg)) { cm_enter_timewait(cm_id_priv); ret = PTR_ERR(msg); goto unlock; } cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, private_data, private_data_len); trace_icm_send_dreq(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_enter_timewait(cm_id_priv); cm_free_priv_msg(msg); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_SENT; unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_dreq); static void cm_format_drep(struct cm_drep_msg *drep_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid); IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg, be32_to_cpu(cm_id_priv->id.remote_id)); if (private_data && private_data_len) IBA_SET_MEM(CM_DREP_PRIVATE_DATA, drep_msg, private_data, private_data_len); } static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, void *private_data, u8 private_data_len) { struct ib_mad_send_buf *msg; int ret; lockdep_assert_held(&cm_id_priv->lock); if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) return -EINVAL; if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { trace_icm_send_drep_err(&cm_id_priv->id); kfree(private_data); return -EINVAL; } cm_set_private_data(cm_id_priv, private_data, private_data_len); cm_enter_timewait(cm_id_priv); msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) return PTR_ERR(msg); cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, private_data, private_data_len); trace_icm_send_drep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_free_msg(msg); return ret; } return 0; } int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv = container_of(cm_id, struct cm_id_private, id); unsigned long flags; void *data; int ret; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); spin_lock_irqsave(&cm_id_priv->lock, flags); ret = cm_send_drep_locked(cm_id_priv, data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_drep); static int cm_issue_drep(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_buf *msg = NULL; struct cm_dreq_msg *dreq_msg; struct cm_drep_msg *drep_msg; int ret; ret = cm_alloc_response_msg(port, mad_recv_wc, true, &msg); if (ret) return ret; dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad; drep_msg = (struct cm_drep_msg *) msg->mad; cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid); IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg, IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg)); IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); trace_icm_issue_drep( IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); return ret; } static int cm_dreq_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_dreq_msg *dreq_msg; struct ib_mad_send_buf *msg = NULL; dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)), cpu_to_be32(IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg))); if (!cm_id_priv) { atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); trace_icm_no_priv_err( IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); return -EINVAL; } work->cm_event.private_data = IBA_GET_MEM_PTR(CM_DREQ_PRIVATE_DATA, dreq_msg); spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->local_qpn != cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg))) goto unlock; switch (cm_id_priv->id.state) { case IB_CM_REP_SENT: case IB_CM_DREQ_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_DREQ_COUNTER]); msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, true); if (IS_ERR(msg)) goto unlock; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_DREQ_COUNTER]); goto unlock; default: trace_icm_dreq_unknown_err(&cm_id_priv->id); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; cm_id_priv->tid = dreq_msg->hdr.tid; cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); deref: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_drep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_drep_msg *drep_msg; drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_DREP_REMOTE_COMM_ID, drep_msg)), cpu_to_be32(IBA_GET(CM_DREP_LOCAL_COMM_ID, drep_msg))); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = IBA_GET_MEM_PTR(CM_DREP_PRIVATE_DATA, drep_msg); spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_DREQ_SENT && cm_id_priv->id.state != IB_CM_DREQ_RCVD) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_enter_timewait(cm_id_priv); ib_cancel_mad(cm_id_priv->msg); cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len) { enum ib_cm_state state = cm_id_priv->id.state; struct ib_mad_send_buf *msg; int ret; lockdep_assert_held(&cm_id_priv->lock); if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) return -EINVAL; trace_icm_send_rej(&cm_id_priv->id, reason); switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: cm_reset_to_idle(cm_id_priv); msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) return PTR_ERR(msg); cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, ari, ari_length, private_data, private_data_len, state); break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_enter_timewait(cm_id_priv); msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) return PTR_ERR(msg); cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, ari, ari_length, private_data, private_data_len, state); break; default: trace_icm_send_unknown_rej_err(&cm_id_priv->id); return -EINVAL; } ret = ib_post_send_mad(msg, NULL); if (ret) { cm_free_msg(msg); return ret; } return 0; } int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv = container_of(cm_id, struct cm_id_private, id); unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); ret = cm_send_rej_locked(cm_id_priv, reason, ari, ari_length, private_data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rej); static void cm_format_rej_event(struct cm_work *work) { struct cm_rej_msg *rej_msg; struct ib_cm_rej_event_param *param; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rej_rcvd; param->ari = IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg); param->ari_length = IBA_GET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg); param->reason = IBA_GET(CM_REJ_REASON, rej_msg); work->cm_event.private_data = IBA_GET_MEM_PTR(CM_REJ_PRIVATE_DATA, rej_msg); } static struct cm_id_private *cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) { struct cm_id_private *cm_id_priv; __be32 remote_id; remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg)); if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) { cm_id_priv = cm_find_remote_id( *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)), remote_id); } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) == CM_MSG_RESPONSE_REQ) cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), 0); else cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), remote_id); return cm_id_priv; } static int cm_rej_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rej_msg *rej_msg; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_rejected_id(rej_msg); if (!cm_id_priv) return -EINVAL; cm_format_rej_event(work); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->msg); fallthrough; case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_STALE_CONN) cm_enter_timewait(cm_id_priv); else cm_reset_to_idle(cm_id_priv); break; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->msg); fallthrough; case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: cm_enter_timewait(cm_id_priv); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) ib_cancel_mad(cm_id_priv->msg); cm_enter_timewait(cm_id_priv); break; } fallthrough; default: trace_icm_rej_unknown_err(&cm_id_priv->id); spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } int ib_prepare_cm_mra(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; enum ib_cm_state cm_state; enum ib_cm_lap_state lap_state; unsigned long flags; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; break; case IB_CM_REP_RCVD: cm_state = IB_CM_MRA_REP_SENT; lap_state = cm_id->lap_state; break; case IB_CM_ESTABLISHED: if (cm_id->lap_state == IB_CM_LAP_RCVD) { cm_state = cm_id->state; lap_state = IB_CM_MRA_LAP_SENT; break; } fallthrough; default: trace_icm_prepare_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; goto error_unlock; } cm_id->state = cm_state; cm_id->lap_state = lap_state; cm_set_private_data(cm_id_priv, NULL, 0); error_unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_prepare_cm_mra); static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) { case CM_MSG_RESPONSE_REQ: return cm_acquire_id( cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), 0); case CM_MSG_RESPONSE_REP: case CM_MSG_RESPONSE_OTHER: return cm_acquire_id( cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), cpu_to_be32(IBA_GET(CM_MRA_LOCAL_COMM_ID, mra_msg))); default: return NULL; } } static int cm_mra_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_mra_msg *mra_msg; int timeout; mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_mraed_id(mra_msg); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = IBA_GET_MEM_PTR(CM_MRA_PRIVATE_DATA, mra_msg); work->cm_event.param.mra_rcvd.service_timeout = IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg); timeout = cm_convert_to_ms(IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg)) + cm_convert_to_ms(cm_id_priv->av.timeout); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != CM_MSG_RESPONSE_REQ || ib_modify_mad(cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; break; case IB_CM_REP_SENT: if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != CM_MSG_RESPONSE_REP || ib_modify_mad(cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; break; case IB_CM_ESTABLISHED: if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != CM_MSG_RESPONSE_OTHER || cm_id_priv->id.lap_state != IB_CM_LAP_SENT || ib_modify_mad(cm_id_priv->msg, timeout)) { if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) atomic_long_inc( &work->port->counters[CM_RECV_DUPLICATES] [CM_MRA_COUNTER]); goto out; } cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; break; case IB_CM_MRA_REQ_RCVD: case IB_CM_MRA_REP_RCVD: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_MRA_COUNTER]); fallthrough; default: trace_icm_mra_unknown_err(&cm_id_priv->id); goto out; } cm_id_priv->msg->context[1] = (void *) (unsigned long) cm_id_priv->id.state; cm_queue_work_unlock(cm_id_priv, work); return 0; out: spin_unlock_irq(&cm_id_priv->lock); cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, struct sa_path_rec *path) { u32 lid; if (path->rec_type != SA_PATH_REC_TYPE_OPA) { sa_path_set_dlid(path, IBA_GET(CM_LAP_ALTERNATE_LOCAL_PORT_LID, lap_msg)); sa_path_set_slid(path, IBA_GET(CM_LAP_ALTERNATE_REMOTE_PORT_LID, lap_msg)); } else { lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg)); sa_path_set_dlid(path, lid); lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg)); sa_path_set_slid(path, lid); } } static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { path->dgid = *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg); path->sgid = *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg); path->flow_label = cpu_to_be32(IBA_GET(CM_LAP_ALTERNATE_FLOW_LABEL, lap_msg)); path->hop_limit = IBA_GET(CM_LAP_ALTERNATE_HOP_LIMIT, lap_msg); path->traffic_class = IBA_GET(CM_LAP_ALTERNATE_TRAFFIC_CLASS, lap_msg); path->reversible = 1; path->pkey = cm_id_priv->pkey; path->sl = IBA_GET(CM_LAP_ALTERNATE_SL, lap_msg); path->mtu_selector = IB_SA_EQ; path->mtu = cm_id_priv->path_mtu; path->rate_selector = IB_SA_EQ; path->rate = IBA_GET(CM_LAP_ALTERNATE_PACKET_RATE, lap_msg); path->packet_life_time_selector = IB_SA_EQ; path->packet_life_time = IBA_GET(CM_LAP_ALTERNATE_LOCAL_ACK_TIMEOUT, lap_msg); path->packet_life_time -= (path->packet_life_time > 0); cm_format_path_lid_from_lap(lap_msg, path); } static int cm_lap_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_lap_msg *lap_msg; struct ib_cm_lap_event_param *param; struct ib_mad_send_buf *msg = NULL; struct rdma_ah_attr ah_attr; struct cm_av alt_av = {}; int ret; /* Currently Alternate path messages are not supported for * RoCE link layer. */ if (rdma_protocol_roce(work->port->cm_dev->ib_device, work->port->port_num)) return -EINVAL; /* todo: verify LAP request and send reject APR if invalid. */ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_LAP_REMOTE_COMM_ID, lap_msg)), cpu_to_be32(IBA_GET(CM_LAP_LOCAL_COMM_ID, lap_msg))); if (!cm_id_priv) return -EINVAL; param = &work->cm_event.param.lap_rcvd; memset(&work->path[0], 0, sizeof(work->path[1])); cm_path_set_rec_type(work->port->cm_dev->ib_device, work->port->port_num, &work->path[0], IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg)); param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); work->cm_event.private_data = IBA_GET_MEM_PTR(CM_LAP_PRIVATE_DATA, lap_msg); ret = ib_init_ah_attr_from_wc(work->port->cm_dev->ib_device, work->port->port_num, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &ah_attr); if (ret) goto deref; ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av); if (ret) { rdma_destroy_ah_attr(&ah_attr); goto deref; } spin_lock_irq(&cm_id_priv->lock); cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, &ah_attr, &cm_id_priv->av); cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) goto unlock; switch (cm_id_priv->id.lap_state) { case IB_CM_LAP_UNINIT: case IB_CM_LAP_IDLE: break; case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_LAP_COUNTER]); msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, true); if (IS_ERR(msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_LAP_COUNTER]); goto unlock; default: goto unlock; } cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); deref: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_apr_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_apr_msg *apr_msg; /* Currently Alternate path messages are not supported for * RoCE link layer. */ if (rdma_protocol_roce(work->port->cm_dev->ib_device, work->port->port_num)) return -EINVAL; apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_APR_REMOTE_COMM_ID, apr_msg)), cpu_to_be32(IBA_GET(CM_APR_LOCAL_COMM_ID, apr_msg))); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. */ work->cm_event.param.apr_rcvd.ap_status = IBA_GET(CM_APR_AR_STATUS, apr_msg); work->cm_event.param.apr_rcvd.apr_info = IBA_GET_MEM_PTR(CM_APR_ADDITIONAL_INFORMATION, apr_msg); work->cm_event.param.apr_rcvd.info_len = IBA_GET(CM_APR_ADDITIONAL_INFORMATION_LENGTH, apr_msg); work->cm_event.private_data = IBA_GET_MEM_PTR(CM_APR_PRIVATE_DATA, apr_msg); spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED || (cm_id_priv->id.lap_state != IB_CM_LAP_SENT && cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; ib_cancel_mad(cm_id_priv->msg); cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_timewait_handler(struct cm_work *work) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; timewait_info = container_of(work, struct cm_timewait_info, work); spin_lock_irq(&cm.lock); list_del(&timewait_info->list); spin_unlock_irq(&cm.lock); cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); if (!cm_id_priv) return -EINVAL; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_TIMEWAIT || cm_id_priv->remote_qpn != timewait_info->remote_qpn) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.state = IB_CM_IDLE; cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_req_param *param) { cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, cm_form_tid(cm_id_priv)); IBA_SET(CM_SIDR_REQ_REQUESTID, sidr_req_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg, be16_to_cpu(param->path->pkey)); IBA_SET(CM_SIDR_REQ_SERVICEID, sidr_req_msg, be64_to_cpu(param->service_id)); if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg, param->private_data, param->private_data_len); } int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, struct ib_cm_sidr_req_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; struct cm_av av = {}; unsigned long flags; int ret; if (!param->path || (param->private_data && param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); ret = cm_init_av_by_path(param->path, param->sgid_attr, &av); if (ret) return ret; spin_lock_irqsave(&cm_id_priv->lock, flags); cm_move_av_from_path(&cm_id_priv->av, &av); cm_id->service_id = param->service_id; cm_id_priv->timeout_ms = param->timeout_ms; cm_id_priv->max_cm_retries = param->max_cm_retries; if (cm_id->state != IB_CM_IDLE) { ret = -EINVAL; goto out_unlock; } msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_SIDR_REQ_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out_unlock; } cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv, param); trace_icm_send_sidr_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto out_free; cm_id->state = IB_CM_SIDR_REQ_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; out_free: cm_free_priv_msg(msg); out_unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); static void cm_format_sidr_req_event(struct cm_work *work, const struct cm_id_private *rx_cm_id, struct ib_cm_id *listen_id) { struct cm_sidr_req_msg *sidr_req_msg; struct ib_cm_sidr_req_event_param *param; sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_req_rcvd; param->pkey = IBA_GET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg); param->listen_id = listen_id; param->service_id = cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); param->bth_pkey = cm_get_bth_pkey(work); param->port = work->port->port_num; param->sgid_attr = rx_cm_id->av.ah_attr.grh.sgid_attr; work->cm_event.private_data = IBA_GET_MEM_PTR(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg); } static int cm_sidr_req_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; int ret; cm_id_priv = cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id_priv)) return PTR_ERR(cm_id_priv); /* Record SGID/SLID and request ID for lookup. */ sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; cm_id_priv->id.remote_id = cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg)); cm_id_priv->id.service_id = cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); cm_id_priv->tid = sidr_req_msg->hdr.tid; wc = work->mad_recv_wc->wc; cm_id_priv->sidr_slid = wc->slid; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); if (ret) goto out; spin_lock_irq(&cm.lock); listen_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); if (listen_cm_id_priv) { spin_unlock_irq(&cm.lock); atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_SIDR_REQ_COUNTER]); goto out; /* Duplicate message. */ } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, cm_id_priv->id.service_id); if (!listen_cm_id_priv) { spin_unlock_irq(&cm.lock); ib_send_cm_sidr_rep(&cm_id_priv->id, &(struct ib_cm_sidr_rep_param){ .status = IB_SIDR_UNSUPPORTED }); goto out; /* No match. */ } spin_unlock_irq(&cm.lock); cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; cm_id_priv->id.context = listen_cm_id_priv->id.context; /* * A SIDR ID does not need to be in the xarray since it does not receive * mads, is not placed in the remote_id or remote_qpn rbtree, and does * not enter timewait. */ cm_format_sidr_req_event(work, cm_id_priv, &listen_cm_id_priv->id); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); /* * A pointer to the listen_cm_id is held in the event, so this deref * must be after the event is delivered above. */ cm_deref_id(listen_cm_id_priv); if (ret) cm_destroy_id(&cm_id_priv->id, ret); return 0; out: ib_destroy_cm_id(&cm_id_priv->id); return -EINVAL; } static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param) { cm_format_mad_ece_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, cm_id_priv->tid, param->ece.attr_mod); IBA_SET(CM_SIDR_REP_REQUESTID, sidr_rep_msg, be32_to_cpu(cm_id_priv->id.remote_id)); IBA_SET(CM_SIDR_REP_STATUS, sidr_rep_msg, param->status); IBA_SET(CM_SIDR_REP_QPN, sidr_rep_msg, param->qp_num); IBA_SET(CM_SIDR_REP_SERVICEID, sidr_rep_msg, be64_to_cpu(cm_id_priv->id.service_id)); IBA_SET(CM_SIDR_REP_Q_KEY, sidr_rep_msg, param->qkey); IBA_SET(CM_SIDR_REP_VENDOR_ID_L, sidr_rep_msg, param->ece.vendor_id & 0xFF); IBA_SET(CM_SIDR_REP_VENDOR_ID_H, sidr_rep_msg, (param->ece.vendor_id >> 8) & 0xFF); if (param->info && param->info_length) IBA_SET_MEM(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg, param->info, param->info_length); if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg, param->private_data, param->private_data_len); } static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param) { struct ib_mad_send_buf *msg; unsigned long flags; int ret; lockdep_assert_held(&cm_id_priv->lock); if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || (param->private_data && param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) return -EINVAL; if (cm_id_priv->id.state != IB_CM_SIDR_REQ_RCVD) return -EINVAL; msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) return PTR_ERR(msg); cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, param); trace_icm_send_sidr_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_free_msg(msg); return ret; } cm_id_priv->id.state = IB_CM_IDLE; spin_lock_irqsave(&cm.lock, flags); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); } spin_unlock_irqrestore(&cm.lock, flags); return 0; } int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param) { struct cm_id_private *cm_id_priv = container_of(cm_id, struct cm_id_private, id); unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); ret = cm_send_sidr_rep_locked(cm_id_priv, param); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_rep); static void cm_format_sidr_rep_event(struct cm_work *work, const struct cm_id_private *cm_id_priv) { struct cm_sidr_rep_msg *sidr_rep_msg; struct ib_cm_sidr_rep_event_param *param; sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_rep_rcvd; param->status = IBA_GET(CM_SIDR_REP_STATUS, sidr_rep_msg); param->qkey = IBA_GET(CM_SIDR_REP_Q_KEY, sidr_rep_msg); param->qpn = IBA_GET(CM_SIDR_REP_QPN, sidr_rep_msg); param->info = IBA_GET_MEM_PTR(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg); param->info_len = IBA_GET(CM_SIDR_REP_ADDITIONAL_INFORMATION_LENGTH, sidr_rep_msg); param->sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; work->cm_event.private_data = IBA_GET_MEM_PTR(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg); } static int cm_sidr_rep_handler(struct cm_work *work) { struct cm_sidr_rep_msg *sidr_rep_msg; struct cm_id_private *cm_id_priv; sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg)), 0); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. */ spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.state = IB_CM_IDLE; ib_cancel_mad(cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); cm_format_sidr_rep_event(work, cm_id_priv); cm_process_work(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_process_send_error(struct cm_id_private *cm_id_priv, struct ib_mad_send_buf *msg, enum ib_wc_status wc_status) { enum ib_cm_state state = (unsigned long) msg->context[1]; struct ib_cm_event cm_event = {}; int ret; /* Discard old sends. */ spin_lock_irq(&cm_id_priv->lock); if (msg != cm_id_priv->msg) { spin_unlock_irq(&cm_id_priv->lock); cm_free_msg(msg); cm_deref_id(cm_id_priv); return; } cm_free_priv_msg(msg); if (state != cm_id_priv->id.state || wc_status == IB_WC_SUCCESS || wc_status == IB_WC_WR_FLUSH_ERR) goto out_unlock; trace_icm_mad_send_err(state, wc_status); switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: cm_reset_to_idle(cm_id_priv); cm_event.event = IB_CM_REQ_ERROR; break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_reset_to_idle(cm_id_priv); cm_event.event = IB_CM_REP_ERROR; break; case IB_CM_DREQ_SENT: cm_enter_timewait(cm_id_priv); cm_event.event = IB_CM_DREQ_ERROR; break; case IB_CM_SIDR_REQ_SENT: cm_id_priv->id.state = IB_CM_IDLE; cm_event.event = IB_CM_SIDR_REQ_ERROR; break; default: goto out_unlock; } spin_unlock_irq(&cm_id_priv->lock); cm_event.param.send_status = wc_status; /* No other events can occur on the cm_id at this point. */ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); if (ret) ib_destroy_cm_id(&cm_id_priv->id); return; out_unlock: spin_unlock_irq(&cm_id_priv->lock); } static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; struct cm_id_private *cm_id_priv; struct cm_port *port; u16 attr_index; port = mad_agent->context; attr_index = be16_to_cpu(((struct ib_mad_hdr *) msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; if (msg->context[0] == CM_DIRECT_RETRY_CTX) { msg->retries = 1; cm_id_priv = NULL; } else { cm_id_priv = msg->context[0]; } atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]); if (msg->retries) atomic_long_add(msg->retries, &port->counters[CM_XMIT_RETRIES][attr_index]); if (cm_id_priv) cm_process_send_error(cm_id_priv, msg, mad_send_wc->status); else cm_free_msg(msg); } static void cm_work_handler(struct work_struct *_work) { struct cm_work *work = container_of(_work, struct cm_work, work.work); int ret; switch (work->cm_event.event) { case IB_CM_REQ_RECEIVED: ret = cm_req_handler(work); break; case IB_CM_MRA_RECEIVED: ret = cm_mra_handler(work); break; case IB_CM_REJ_RECEIVED: ret = cm_rej_handler(work); break; case IB_CM_REP_RECEIVED: ret = cm_rep_handler(work); break; case IB_CM_RTU_RECEIVED: ret = cm_rtu_handler(work); break; case IB_CM_USER_ESTABLISHED: ret = cm_establish_handler(work); break; case IB_CM_DREQ_RECEIVED: ret = cm_dreq_handler(work); break; case IB_CM_DREP_RECEIVED: ret = cm_drep_handler(work); break; case IB_CM_SIDR_REQ_RECEIVED: ret = cm_sidr_req_handler(work); break; case IB_CM_SIDR_REP_RECEIVED: ret = cm_sidr_rep_handler(work); break; case IB_CM_LAP_RECEIVED: ret = cm_lap_handler(work); break; case IB_CM_APR_RECEIVED: ret = cm_apr_handler(work); break; case IB_CM_TIMEWAIT_EXIT: ret = cm_timewait_handler(work); break; default: trace_icm_handler_err(work->cm_event.event); ret = -EINVAL; break; } if (ret) cm_free_work(work); } static int cm_establish(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; struct cm_work *work; unsigned long flags; int ret = 0; struct cm_device *cm_dev; cm_dev = ib_get_client_data(cm_id->device, &cm_client); if (!cm_dev) return -ENODEV; work = kmalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id->state) { case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_id->state = IB_CM_ESTABLISHED; break; case IB_CM_ESTABLISHED: ret = -EISCONN; break; default: trace_icm_establish_err(cm_id); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (ret) { kfree(work); goto out; } /* * The CM worker thread may try to destroy the cm_id before it * can execute this work item. To prevent potential deadlock, * we need to find the cm_id once we're in the context of the * worker thread, rather than holding a reference on it. */ INIT_DELAYED_WORK(&work->work, cm_work_handler); work->local_id = cm_id->local_id; work->remote_id = cm_id->remote_id; work->mad_recv_wc = NULL; work->cm_event.event = IB_CM_USER_ESTABLISHED; /* Check if the device started its remove_one */ spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) { queue_delayed_work(cm.wq, &work->work, 0); } else { kfree(work); ret = -ENODEV; } spin_unlock_irqrestore(&cm.lock, flags); out: return ret; } static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; unsigned long flags; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_ESTABLISHED && (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; cm_id_priv->av = cm_id_priv->alt_av; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) { int ret; switch (event) { case IB_EVENT_COMM_EST: ret = cm_establish(cm_id); break; case IB_EVENT_PATH_MIG: ret = cm_migrate(cm_id); break; default: ret = -EINVAL; } return ret; } EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; struct cm_work *work; enum ib_cm_event_type event; bool alt_path = false; u16 attr_id; int paths = 0; int going_down = 0; switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { case CM_REQ_ATTR_ID: alt_path = cm_req_has_alt_path((struct cm_req_msg *) mad_recv_wc->recv_buf.mad); paths = 1 + (alt_path != 0); event = IB_CM_REQ_RECEIVED; break; case CM_MRA_ATTR_ID: event = IB_CM_MRA_RECEIVED; break; case CM_REJ_ATTR_ID: event = IB_CM_REJ_RECEIVED; break; case CM_REP_ATTR_ID: event = IB_CM_REP_RECEIVED; break; case CM_RTU_ATTR_ID: event = IB_CM_RTU_RECEIVED; break; case CM_DREQ_ATTR_ID: event = IB_CM_DREQ_RECEIVED; break; case CM_DREP_ATTR_ID: event = IB_CM_DREP_RECEIVED; break; case CM_SIDR_REQ_ATTR_ID: event = IB_CM_SIDR_REQ_RECEIVED; break; case CM_SIDR_REP_ATTR_ID: event = IB_CM_SIDR_REP_RECEIVED; break; case CM_LAP_ATTR_ID: paths = 1; event = IB_CM_LAP_RECEIVED; break; case CM_APR_ATTR_ID: event = IB_CM_APR_RECEIVED; break; default: ib_free_recv_mad(mad_recv_wc); return; } attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); atomic_long_inc(&port->counters[CM_RECV][attr_id - CM_ATTR_ID_OFFSET]); work = kmalloc(struct_size(work, path, paths), GFP_KERNEL); if (!work) { ib_free_recv_mad(mad_recv_wc); return; } INIT_DELAYED_WORK(&work->work, cm_work_handler); work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; work->port = port; /* Check if the device started its remove_one */ spin_lock_irq(&cm.lock); if (!port->cm_dev->going_down) queue_delayed_work(cm.wq, &work->work, 0); else going_down = 1; spin_unlock_irq(&cm.lock); if (going_down) { kfree(work); ib_free_recv_mad(mad_recv_wc); } } static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; if (cm_id_priv->responder_resources) { struct ib_device *ib_dev = cm_id_priv->id.device; u64 support_flush = ib_dev->attrs.device_cap_flags & (IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT); u32 flushable = support_flush ? (IB_ACCESS_FLUSH_GLOBAL | IB_ACCESS_FLUSH_PERSISTENT) : 0; qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC | flushable; } qp_attr->pkey_index = cm_id_priv->av.pkey_index; if (cm_id_priv->av.port) qp_attr->port_num = cm_id_priv->av.port->port_num; ret = 0; break; default: trace_icm_qp_init_err(&cm_id_priv->id); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; if ((qp_attr->ah_attr.type == RDMA_AH_ATTR_TYPE_IB) && cm_id_priv->av.dlid_datapath && (cm_id_priv->av.dlid_datapath != 0xffff)) qp_attr->ah_attr.ib.dlid = cm_id_priv->av.dlid_datapath; qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); if (cm_id_priv->qp_type == IB_QPT_RC || cm_id_priv->qp_type == IB_QPT_XRC_TGT) { *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; qp_attr->max_dest_rd_atomic = cm_id_priv->responder_resources; qp_attr->min_rnr_timer = 0; } if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr) && cm_id_priv->alt_av.port) { *qp_attr_mask |= IB_QP_ALT_PATH; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; } ret = 0; break; default: trace_icm_qp_rtr_err(&cm_id_priv->id); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { /* Allow transition to RTS before sending REP */ case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); switch (cm_id_priv->qp_type) { case IB_QPT_RC: case IB_QPT_XRC_INI: *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC; qp_attr->retry_cnt = cm_id_priv->retry_count; qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; fallthrough; case IB_QPT_XRC_TGT: *qp_attr_mask |= IB_QP_TIMEOUT; qp_attr->timeout = cm_id_priv->av.timeout; break; default: break; } if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { *qp_attr_mask |= IB_QP_PATH_MIG_STATE; qp_attr->path_mig_state = IB_MIG_REARM; } } else { *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; if (cm_id_priv->alt_av.port) qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; qp_attr->path_mig_state = IB_MIG_REARM; } ret = 0; break; default: trace_icm_qp_rts_err(&cm_id_priv->id); ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct cm_id_private *cm_id_priv; int ret; cm_id_priv = container_of(cm_id, struct cm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTR: ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(ib_cm_init_qp_attr); static ssize_t cm_show_counter(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf) { struct cm_counter_attribute *cm_attr = container_of(attr, struct cm_counter_attribute, attr); struct cm_device *cm_dev = ib_get_client_data(ibdev, &cm_client); if (WARN_ON(!cm_dev)) return -EINVAL; return sysfs_emit( buf, "%ld\n", atomic_long_read( &cm_dev->port[port_num - 1] ->counters[cm_attr->group][cm_attr->index])); } #define CM_COUNTER_ATTR(_name, _group, _index) \ { \ .attr = __ATTR(_name, 0444, cm_show_counter, NULL), \ .group = _group, .index = _index \ } #define CM_COUNTER_GROUP(_group, _name) \ static struct cm_counter_attribute cm_counter_attr_##_group[] = { \ CM_COUNTER_ATTR(req, _group, CM_REQ_COUNTER), \ CM_COUNTER_ATTR(mra, _group, CM_MRA_COUNTER), \ CM_COUNTER_ATTR(rej, _group, CM_REJ_COUNTER), \ CM_COUNTER_ATTR(rep, _group, CM_REP_COUNTER), \ CM_COUNTER_ATTR(rtu, _group, CM_RTU_COUNTER), \ CM_COUNTER_ATTR(dreq, _group, CM_DREQ_COUNTER), \ CM_COUNTER_ATTR(drep, _group, CM_DREP_COUNTER), \ CM_COUNTER_ATTR(sidr_req, _group, CM_SIDR_REQ_COUNTER), \ CM_COUNTER_ATTR(sidr_rep, _group, CM_SIDR_REP_COUNTER), \ CM_COUNTER_ATTR(lap, _group, CM_LAP_COUNTER), \ CM_COUNTER_ATTR(apr, _group, CM_APR_COUNTER), \ }; \ static struct attribute *cm_counter_attrs_##_group[] = { \ &cm_counter_attr_##_group[0].attr.attr, \ &cm_counter_attr_##_group[1].attr.attr, \ &cm_counter_attr_##_group[2].attr.attr, \ &cm_counter_attr_##_group[3].attr.attr, \ &cm_counter_attr_##_group[4].attr.attr, \ &cm_counter_attr_##_group[5].attr.attr, \ &cm_counter_attr_##_group[6].attr.attr, \ &cm_counter_attr_##_group[7].attr.attr, \ &cm_counter_attr_##_group[8].attr.attr, \ &cm_counter_attr_##_group[9].attr.attr, \ &cm_counter_attr_##_group[10].attr.attr, \ NULL, \ }; \ static const struct attribute_group cm_counter_group_##_group = { \ .name = _name, \ .attrs = cm_counter_attrs_##_group, \ }; CM_COUNTER_GROUP(CM_XMIT, "cm_tx_msgs") CM_COUNTER_GROUP(CM_XMIT_RETRIES, "cm_tx_retries") CM_COUNTER_GROUP(CM_RECV, "cm_rx_msgs") CM_COUNTER_GROUP(CM_RECV_DUPLICATES, "cm_rx_duplicates") static const struct attribute_group *cm_counter_groups[] = { &cm_counter_group_CM_XMIT, &cm_counter_group_CM_XMIT_RETRIES, &cm_counter_group_CM_RECV, &cm_counter_group_CM_RECV_DUPLICATES, NULL, }; static int cm_add_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; struct ib_mad_reg_req reg_req = { .mgmt_class = IB_MGMT_CLASS_CM, .mgmt_class_version = IB_CM_CLASS_VERSION, }; struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; int ret; int count = 0; u32 i; cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt), GFP_KERNEL); if (!cm_dev) return -ENOMEM; kref_init(&cm_dev->kref); rwlock_init(&cm_dev->mad_agent_lock); cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; ib_set_client_data(ib_device, &cm_client, cm_dev); set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); rdma_for_each_port (ib_device, i) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) { ret = -ENOMEM; goto error1; } cm_dev->port[i-1] = port; port->cm_dev = cm_dev; port->port_num = i; ret = ib_port_register_client_groups(ib_device, i, cm_counter_groups); if (ret) goto error1; port->mad_agent = ib_register_mad_agent(ib_device, i, IB_QPT_GSI, ®_req, 0, cm_send_handler, cm_recv_handler, port, 0); if (IS_ERR(port->mad_agent)) { ret = PTR_ERR(port->mad_agent); goto error2; } port->rep_agent = ib_register_mad_agent(ib_device, i, IB_QPT_GSI, NULL, 0, cm_send_handler, NULL, port, 0); if (IS_ERR(port->rep_agent)) { ret = PTR_ERR(port->rep_agent); goto error3; } ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) goto error4; count++; } if (!count) { ret = -EOPNOTSUPP; goto free; } write_lock_irqsave(&cm.device_lock, flags); list_add_tail(&cm_dev->list, &cm.device_list); write_unlock_irqrestore(&cm.device_lock, flags); return 0; error4: ib_unregister_mad_agent(port->rep_agent); error3: ib_unregister_mad_agent(port->mad_agent); error2: ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); error1: port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->rep_agent); ib_unregister_mad_agent(port->mad_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); } free: cm_device_put(cm_dev); return ret; } static void cm_remove_one(struct ib_device *ib_device, void *client_data) { struct cm_device *cm_dev = client_data; struct cm_port *port; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; u32 i; write_lock_irqsave(&cm.device_lock, flags); list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); spin_lock_irq(&cm.lock); cm_dev->going_down = 1; spin_unlock_irq(&cm.lock); rdma_for_each_port (ib_device, i) { struct ib_mad_agent *mad_agent; struct ib_mad_agent *rep_agent; if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; mad_agent = port->mad_agent; rep_agent = port->rep_agent; ib_modify_port(ib_device, port->port_num, 0, &port_modify); /* * We flush the queue here after the going_down set, this * verify that no new works will be queued in the recv handler, * after that we can call the unregister_mad_agent */ flush_workqueue(cm.wq); /* * The above ensures no call paths from the work are running, * the remaining paths all take the mad_agent_lock. */ write_lock(&cm_dev->mad_agent_lock); port->mad_agent = NULL; port->rep_agent = NULL; write_unlock(&cm_dev->mad_agent_lock); ib_unregister_mad_agent(mad_agent); ib_unregister_mad_agent(rep_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); } cm_device_put(cm_dev); } static int __init ib_cm_init(void) { int ret; INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; cm.remote_qp_table = RB_ROOT; cm.remote_sidr_table = RB_ROOT; xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); INIT_LIST_HEAD(&cm.timewait_list); cm.wq = alloc_workqueue("ib_cm", 0, 1); if (!cm.wq) { ret = -ENOMEM; goto error2; } ret = ib_register_client(&cm_client); if (ret) goto error3; return 0; error3: destroy_workqueue(cm.wq); error2: return ret; } static void __exit ib_cm_cleanup(void) { struct cm_timewait_info *timewait_info, *tmp; spin_lock_irq(&cm.lock); list_for_each_entry(timewait_info, &cm.timewait_list, list) cancel_delayed_work(&timewait_info->work.work); spin_unlock_irq(&cm.lock); ib_unregister_client(&cm_client); destroy_workqueue(cm.wq); list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) { list_del(&timewait_info->list); kfree(timewait_info); } WARN_ON(!xa_empty(&cm.local_id_table)); } module_init(ib_cm_init); module_exit(ib_cm_cleanup); |
| 1927 1931 31 1920 1926 908 1920 1907 1902 1894 1922 980 1928 131 1925 31 1925 1278 1175 122 1175 6 646 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IP/TCP/UDP checksumming routines * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Tom May, <ftom@netcom.com> * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> * Lots of code moved from tcp.c and ip.c; see those files * for more names. * * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: * Fixed some nasty bugs, causing some horrible crashes. * A: At some points, the sum (%0) was used as * length-counter instead of the length counter * (%1). Thanks to Roman Hodek for pointing this out. * B: GCC seems to mess up if one uses too many * data-registers to hold input values and one tries to * specify d0 and d1 as scratch registers. Letting gcc * choose these registers itself solves the problem. */ /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access kills, so most of the assembly has to go. */ #include <linux/export.h> #include <net/checksum.h> #include <asm/byteorder.h> #ifndef do_csum static unsigned int do_csum(const unsigned char *buff, int len) { int odd; unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long) buff) { result += *(unsigned short *) buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *) buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *) buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } #endif #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__force __sum16)~do_csum(iph, ihl*4); } EXPORT_SYMBOL(ip_fast_csum); #endif /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ __wsum csum_partial(const void *buff, int len, __wsum wsum) { unsigned int sum = (__force unsigned int)wsum; unsigned int result = do_csum(buff, len); /* add in old sum, and carry.. */ result += sum; if (sum > result) result += 1; return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ __sum16 ip_compute_csum(const void *buff, int len) { return (__force __sum16)~do_csum(buff, len); } EXPORT_SYMBOL(ip_compute_csum); #ifndef csum_tcpudp_nofold static inline u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. */ x = (x & 0xffffffff) + (x >> 32); return (u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; #ifdef __BIG_ENDIAN s += proto + len; #else s += (proto + len) << 8; #endif return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif |
| 2 2 35 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 2 2 2 1 2 3 2 3 3 3 1 1 1 1 33 33 33 33 33 33 33 33 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME, .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, .name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, }; static int devlink_param_generic_verify(const struct devlink_param *param) { /* verify it match generic parameter by id and name */ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) return -EINVAL; if (strcmp(param->name, devlink_param_generic[param->id].name)) return -ENOENT; WARN_ON(param->type != devlink_param_generic[param->id].type); return 0; } static int devlink_param_driver_verify(const struct devlink_param *param) { int i; if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) return -EINVAL; /* verify no such name in generic params */ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) if (!strcmp(param->name, devlink_param_generic[i].name)) return -EEXIST; return 0; } static struct devlink_param_item * devlink_param_find_by_name(struct xarray *params, const char *param_name) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(params, param_id, param_item) { if (!strcmp(param_item->param->name, param_name)) return param_item; } return NULL; } static struct devlink_param_item * devlink_param_find_by_id(struct xarray *params, u32 param_id) { return xa_load(params, param_id); } static bool devlink_param_cmode_is_supported(const struct devlink_param *param, enum devlink_param_cmode cmode) { return test_bit(cmode, ¶m->supported_cmodes); } static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { if (!param->get) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack) { if (!param->set) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx, extack); } static int devlink_nl_param_value_fill_one(struct sk_buff *msg, enum devlink_param_type type, enum devlink_param_cmode cmode, union devlink_param_value val) { struct nlattr *param_value_attr; param_value_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM_VALUE); if (!param_value_attr) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) goto value_nest_cancel; switch (type) { case DEVLINK_PARAM_TYPE_U8: if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U16: if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U32: if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U64: if (devlink_nl_put_u64(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu64)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_STRING: if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vstr)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_BOOL: if (val.vbool && nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) goto value_nest_cancel; break; } nla_nest_end(msg, param_value_attr); return 0; value_nest_cancel: nla_nest_cancel(msg, param_value_attr); nla_put_failure: return -EMSGSIZE; } static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; const struct devlink_param *param = param_item->param; struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; struct nlattr *param_attr; void *hdr; int err; int i; /* Get value from driver part to driverinit configuration mode */ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { if (!devlink_param_cmode_is_supported(param, i)) continue; if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { if (param_item->driverinit_value_new_valid) param_value[i] = param_item->driverinit_value_new; else if (param_item->driverinit_value_valid) param_value[i] = param_item->driverinit_value; else return -EOPNOTSUPP; } else { ctx.cmode = i; err = devlink_param_get(devlink, param, &ctx); if (err) return err; param_value[i] = ctx.val; } param_value_set[i] = true; } hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; if (cmd == DEVLINK_CMD_PORT_PARAM_GET || cmd == DEVLINK_CMD_PORT_PARAM_NEW || cmd == DEVLINK_CMD_PORT_PARAM_DEL) if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) goto genlmsg_cancel; param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM); if (!param_attr) goto genlmsg_cancel; if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) goto param_nest_cancel; if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) goto param_nest_cancel; if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type)) goto param_nest_cancel; param_values_list = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM_VALUES_LIST); if (!param_values_list) goto param_nest_cancel; for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { if (!param_value_set[i]) continue; err = devlink_nl_param_value_fill_one(msg, param->type, i, param_value[i]); if (err) goto values_list_nest_cancel; } nla_nest_end(msg, param_values_list); nla_nest_end(msg, param_attr); genlmsg_end(msg, hdr); return 0; values_list_nest_cancel: nla_nest_end(msg, param_values_list); param_nest_cancel: nla_nest_cancel(msg, param_attr); genlmsg_cancel: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_param_notify(struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && cmd != DEVLINK_CMD_PORT_PARAM_NEW && cmd != DEVLINK_CMD_PORT_PARAM_DEL); /* devlink_notify_register() / devlink_notify_unregister() * will replay the notifications if the params are added/removed * outside of the lifetime of the instance. */ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } static void devlink_params_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(&devlink->params, param_id, param_item) devlink_param_notify(devlink, 0, param_item, cmd); } void devlink_params_notify_register(struct devlink *devlink) { devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW); } void devlink_params_notify_unregister(struct devlink *devlink) { devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL); } static int devlink_nl_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; unsigned long param_id; int err = 0; xa_for_each_start(&devlink->params, param_id, param_item, state->idx) { err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { state->idx = param_id; break; } } return err; } int devlink_nl_param_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); } static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) { if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]); return 0; } static int devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { struct nlattr *param_data; int len; param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) return -EINVAL; switch (param->type) { case DEVLINK_PARAM_TYPE_U8: if (nla_len(param_data) != sizeof(u8)) return -EINVAL; value->vu8 = nla_get_u8(param_data); break; case DEVLINK_PARAM_TYPE_U16: if (nla_len(param_data) != sizeof(u16)) return -EINVAL; value->vu16 = nla_get_u16(param_data); break; case DEVLINK_PARAM_TYPE_U32: if (nla_len(param_data) != sizeof(u32)) return -EINVAL; value->vu32 = nla_get_u32(param_data); break; case DEVLINK_PARAM_TYPE_U64: if (nla_len(param_data) != sizeof(u64)) return -EINVAL; value->vu64 = nla_get_u64(param_data); break; case DEVLINK_PARAM_TYPE_STRING: len = strnlen(nla_data(param_data), nla_len(param_data)); if (len == nla_len(param_data) || len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; strcpy(value->vstr, nla_data(param_data)); break; case DEVLINK_PARAM_TYPE_BOOL: if (param_data && nla_len(param_data)) return -EINVAL; value->vbool = nla_get_flag(param_data); break; } return 0; } static struct devlink_param_item * devlink_param_get_from_info(struct xarray *params, struct genl_info *info) { char *param_name; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME)) return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); return devlink_param_find_by_name(params, param_name); } int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; param_item = devlink_param_get_from_info(&devlink->params, info); if (!param_item) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, unsigned int port_index, struct xarray *params, struct genl_info *info, enum devlink_command cmd) { enum devlink_param_type param_type; struct devlink_param_gset_ctx ctx; enum devlink_param_cmode cmode; struct devlink_param_item *param_item; const struct devlink_param *param; union devlink_param_value value; int err = 0; param_item = devlink_param_get_from_info(params, info); if (!param_item) return -EINVAL; param = param_item->param; err = devlink_param_type_get_from_info(info, ¶m_type); if (err) return err; if (param_type != param->type) return -EINVAL; err = devlink_param_value_get_from_info(param, info, &value); if (err) return err; if (param->validate) { err = param->validate(devlink, param->id, value, info->extack); if (err) return err; } if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) return -EINVAL; cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); if (!devlink_param_cmode_is_supported(param, cmode)) return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { param_item->driverinit_value_new = value; param_item->driverinit_value_new_valid = true; } else { if (!param->set) return -EOPNOTSUPP; ctx.val = value; ctx.cmode = cmode; err = devlink_param_set(devlink, param, &ctx, info->extack); if (err) return err; } devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } int devlink_nl_param_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params, info, DEVLINK_CMD_PARAM_NEW); } int devlink_nl_port_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { NL_SET_ERR_MSG(cb->extack, "Port params are not supported"); return msg->len; } int devlink_nl_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } int devlink_nl_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } static int devlink_param_verify(const struct devlink_param *param) { if (!param || !param->name || !param->supported_cmodes) return -EINVAL; if (param->generic) return devlink_param_generic_verify(param); else return devlink_param_driver_verify(param); } static int devlink_param_register(struct devlink *devlink, const struct devlink_param *param) { struct devlink_param_item *param_item; int err; WARN_ON(devlink_param_verify(param)); WARN_ON(devlink_param_find_by_name(&devlink->params, param->name)); if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) WARN_ON(param->get || param->set); else WARN_ON(!param->get || !param->set); param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); if (!param_item) return -ENOMEM; param_item->param = param; err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL); if (err) goto err_xa_insert; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); return 0; err_xa_insert: kfree(param_item); return err; } static void devlink_param_unregister(struct devlink *devlink, const struct devlink_param *param) { struct devlink_param_item *param_item; param_item = devlink_param_find_by_id(&devlink->params, param->id); if (WARN_ON(!param_item)) return; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); xa_erase(&devlink->params, param->id); kfree(param_item); } /** * devl_params_register - register configuration parameters * * @devlink: devlink * @params: configuration parameters array * @params_count: number of parameters provided * * Register the configuration parameters supported by the driver. */ int devl_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { const struct devlink_param *param = params; int i, err; lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) { err = devlink_param_register(devlink, param); if (err) goto rollback; } return 0; rollback: if (!i) return err; for (param--; i > 0; i--, param--) devlink_param_unregister(devlink, param); return err; } EXPORT_SYMBOL_GPL(devl_params_register); int devlink_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { int err; devl_lock(devlink); err = devl_params_register(devlink, params, params_count); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_params_register); /** * devl_params_unregister - unregister configuration parameters * @devlink: devlink * @params: configuration parameters to unregister * @params_count: number of parameters provided */ void devl_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { const struct devlink_param *param = params; int i; lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) devlink_param_unregister(devlink, param); } EXPORT_SYMBOL_GPL(devl_params_unregister); void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { devl_lock(devlink); devl_params_unregister(devlink, params, params_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_params_unregister); /** * devl_param_driverinit_value_get - get configuration parameter * value for driver initializing * * @devlink: devlink * @param_id: parameter ID * @val: pointer to store the value of parameter in driverinit * configuration mode * * This function should be used by the driver to get driverinit * configuration for initialization after reload command. * * Note that lockless call of this function relies on the * driver to maintain following basic sane behavior: * 1) Driver ensures a call to this function cannot race with * registering/unregistering the parameter with the same parameter ID. * 2) Driver ensures a call to this function cannot race with * devl_param_driverinit_value_set() call with the same parameter ID. * 3) Driver ensures a call to this function cannot race with * reload operation. * If the driver is not able to comply, it has to take the devlink->lock * while calling this. */ int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *val) { struct devlink_param_item *param_item; if (WARN_ON(!devlink_reload_supported(devlink->ops))) return -EOPNOTSUPP; param_item = devlink_param_find_by_id(&devlink->params, param_id); if (!param_item) return -EINVAL; if (!param_item->driverinit_value_valid) return -EOPNOTSUPP; if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT))) return -EOPNOTSUPP; *val = param_item->driverinit_value; return 0; } EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get); /** * devl_param_driverinit_value_set - set value of configuration * parameter for driverinit * configuration mode * * @devlink: devlink * @param_id: parameter ID * @init_val: value of parameter to set for driverinit configuration mode * * This function should be used by the driver to set driverinit * configuration mode default value. */ void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val) { struct devlink_param_item *param_item; devl_assert_locked(devlink); param_item = devlink_param_find_by_id(&devlink->params, param_id); if (WARN_ON(!param_item)) return; if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT))) return; param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set); void devlink_params_driverinit_load_new(struct devlink *devlink) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(&devlink->params, param_id, param_item) { if (!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT) || !param_item->driverinit_value_new_valid) continue; param_item->driverinit_value = param_item->driverinit_value_new; param_item->driverinit_value_valid = true; param_item->driverinit_value_new_valid = false; } } /** * devl_param_value_changed - notify devlink on a parameter's value * change. Should be called by the driver * right after the change. * * @devlink: devlink * @param_id: parameter ID * * This function should be used by the driver to notify devlink on value * change, excluding driverinit configuration mode. * For driverinit configuration mode driver should use the function */ void devl_param_value_changed(struct devlink *devlink, u32 param_id) { struct devlink_param_item *param_item; param_item = devlink_param_find_by_id(&devlink->params, param_id); WARN_ON(!param_item); devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devl_param_value_changed); |
| 2 2 2 2 2 2 2 2 29 29 29 29 29 29 29 29 29 29 29 28 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/svc.c * * High-level RPC service routines * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> * * Multiple threads pools and NUMAisation * Copyright (c) 2006 Silicon Graphics, Inc. * by Greg Banks <gnb@melbourne.sgi.com> */ #include <linux/linkage.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/net.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/bc_xprt.h> #include <trace/events/sunrpc.h> #include "fail.h" #include "sunrpc.h" #define RPCDBG_FACILITY RPCDBG_SVCDSP static void svc_unregister(const struct svc_serv *serv, struct net *net); #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL /* * Mode for mapping cpus to pools. */ enum { SVC_POOL_AUTO = -1, /* choose one of the others */ SVC_POOL_GLOBAL, /* no mapping, just a single global pool * (legacy & UP mode) */ SVC_POOL_PERCPU, /* one pool per cpu */ SVC_POOL_PERNODE /* one pool per numa node */ }; /* * Structure for mapping cpus to pools and vice versa. * Setup once during sunrpc initialisation. */ struct svc_pool_map { int count; /* How many svc_servs use us */ int mode; /* Note: int not enum to avoid * warnings about "enumeration value * not handled in switch" */ unsigned int npools; unsigned int *pool_to; /* maps pool id to cpu or node */ unsigned int *to_pool; /* maps cpu or node to pool id */ }; static struct svc_pool_map svc_pool_map = { .mode = SVC_POOL_DEFAULT }; static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int __param_set_pool_mode(const char *val, struct svc_pool_map *m) { int err, mode; mutex_lock(&svc_pool_map_mutex); err = 0; if (!strncmp(val, "auto", 4)) mode = SVC_POOL_AUTO; else if (!strncmp(val, "global", 6)) mode = SVC_POOL_GLOBAL; else if (!strncmp(val, "percpu", 6)) mode = SVC_POOL_PERCPU; else if (!strncmp(val, "pernode", 7)) mode = SVC_POOL_PERNODE; else err = -EINVAL; if (err) goto out; if (m->count == 0) m->mode = mode; else if (mode != m->mode) err = -EBUSY; out: mutex_unlock(&svc_pool_map_mutex); return err; } static int param_set_pool_mode(const char *val, const struct kernel_param *kp) { struct svc_pool_map *m = kp->arg; return __param_set_pool_mode(val, m); } int sunrpc_set_pool_mode(const char *val) { return __param_set_pool_mode(val, &svc_pool_map); } EXPORT_SYMBOL(sunrpc_set_pool_mode); /** * sunrpc_get_pool_mode - get the current pool_mode for the host * @buf: where to write the current pool_mode * @size: size of @buf * * Grab the current pool_mode from the svc_pool_map and write * the resulting string to @buf. Returns the number of characters * written to @buf (a'la snprintf()). */ int sunrpc_get_pool_mode(char *buf, size_t size) { struct svc_pool_map *m = &svc_pool_map; switch (m->mode) { case SVC_POOL_AUTO: return snprintf(buf, size, "auto"); case SVC_POOL_GLOBAL: return snprintf(buf, size, "global"); case SVC_POOL_PERCPU: return snprintf(buf, size, "percpu"); case SVC_POOL_PERNODE: return snprintf(buf, size, "pernode"); default: return snprintf(buf, size, "%d", m->mode); } } EXPORT_SYMBOL(sunrpc_get_pool_mode); static int param_get_pool_mode(char *buf, const struct kernel_param *kp) { char str[16]; int len; len = sunrpc_get_pool_mode(str, ARRAY_SIZE(str)); /* Ensure we have room for newline and NUL */ len = min_t(int, len, ARRAY_SIZE(str) - 2); /* tack on the newline */ str[len] = '\n'; str[len + 1] = '\0'; return sysfs_emit(buf, "%s", str); } module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode, &svc_pool_map, 0644); /* * Detect best pool mapping mode heuristically, * according to the machine's topology. */ static int svc_pool_map_choose_mode(void) { unsigned int node; if (nr_online_nodes > 1) { /* * Actually have multiple NUMA nodes, * so split pools on NUMA node boundaries */ return SVC_POOL_PERNODE; } node = first_online_node; if (nr_cpus_node(node) > 2) { /* * Non-trivial SMP, or CONFIG_NUMA on * non-NUMA hardware, e.g. with a generic * x86_64 kernel on Xeons. In this case we * want to divide the pools on cpu boundaries. */ return SVC_POOL_PERCPU; } /* default: one global pool */ return SVC_POOL_GLOBAL; } /* * Allocate the to_pool[] and pool_to[] arrays. * Returns 0 on success or an errno. */ static int svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools) { m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); if (!m->to_pool) goto fail; m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); if (!m->pool_to) goto fail_free; return 0; fail_free: kfree(m->to_pool); m->to_pool = NULL; fail: return -ENOMEM; } /* * Initialise the pool map for SVC_POOL_PERCPU mode. * Returns number of pools or <0 on error. */ static int svc_pool_map_init_percpu(struct svc_pool_map *m) { unsigned int maxpools = nr_cpu_ids; unsigned int pidx = 0; unsigned int cpu; int err; err = svc_pool_map_alloc_arrays(m, maxpools); if (err) return err; for_each_online_cpu(cpu) { BUG_ON(pidx >= maxpools); m->to_pool[cpu] = pidx; m->pool_to[pidx] = cpu; pidx++; } /* cpus brought online later all get mapped to pool0, sorry */ return pidx; }; /* * Initialise the pool map for SVC_POOL_PERNODE mode. * Returns number of pools or <0 on error. */ static int svc_pool_map_init_pernode(struct svc_pool_map *m) { unsigned int maxpools = nr_node_ids; unsigned int pidx = 0; unsigned int node; int err; err = svc_pool_map_alloc_arrays(m, maxpools); if (err) return err; for_each_node_with_cpus(node) { /* some architectures (e.g. SN2) have cpuless nodes */ BUG_ON(pidx > maxpools); m->to_pool[node] = pidx; m->pool_to[pidx] = node; pidx++; } /* nodes brought online later all get mapped to pool0, sorry */ return pidx; } /* * Add a reference to the global map of cpus to pools (and * vice versa) if pools are in use. * Initialise the map if we're the first user. * Returns the number of pools. If this is '1', no reference * was taken. */ static unsigned int svc_pool_map_get(void) { struct svc_pool_map *m = &svc_pool_map; int npools = -1; mutex_lock(&svc_pool_map_mutex); if (m->count++) { mutex_unlock(&svc_pool_map_mutex); return m->npools; } if (m->mode == SVC_POOL_AUTO) m->mode = svc_pool_map_choose_mode(); switch (m->mode) { case SVC_POOL_PERCPU: npools = svc_pool_map_init_percpu(m); break; case SVC_POOL_PERNODE: npools = svc_pool_map_init_pernode(m); break; } if (npools <= 0) { /* default, or memory allocation failure */ npools = 1; m->mode = SVC_POOL_GLOBAL; } m->npools = npools; mutex_unlock(&svc_pool_map_mutex); return npools; } /* * Drop a reference to the global map of cpus to pools. * When the last reference is dropped, the map data is * freed; this allows the sysadmin to change the pool. */ static void svc_pool_map_put(void) { struct svc_pool_map *m = &svc_pool_map; mutex_lock(&svc_pool_map_mutex); if (!--m->count) { kfree(m->to_pool); m->to_pool = NULL; kfree(m->pool_to); m->pool_to = NULL; m->npools = 0; } mutex_unlock(&svc_pool_map_mutex); } static int svc_pool_map_get_node(unsigned int pidx) { const struct svc_pool_map *m = &svc_pool_map; if (m->count) { if (m->mode == SVC_POOL_PERCPU) return cpu_to_node(m->pool_to[pidx]); if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } return NUMA_NO_NODE; } /* * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. */ static inline void svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) { struct svc_pool_map *m = &svc_pool_map; unsigned int node = m->pool_to[pidx]; /* * The caller checks for sv_nrpools > 1, which * implies that we've been initialized. */ WARN_ON_ONCE(m->count == 0); if (m->count == 0) return; switch (m->mode) { case SVC_POOL_PERCPU: { set_cpus_allowed_ptr(task, cpumask_of(node)); break; } case SVC_POOL_PERNODE: { set_cpus_allowed_ptr(task, cpumask_of_node(node)); break; } } } /** * svc_pool_for_cpu - Select pool to run a thread on this cpu * @serv: An RPC service * * Use the active CPU and the svc_pool_map's mode setting to * select the svc thread pool to use. Once initialized, the * svc_pool_map does not change. * * Return value: * A pointer to an svc_pool */ struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv) { struct svc_pool_map *m = &svc_pool_map; int cpu = raw_smp_processor_id(); unsigned int pidx = 0; if (serv->sv_nrpools <= 1) return serv->sv_pools; switch (m->mode) { case SVC_POOL_PERCPU: pidx = m->to_pool[cpu]; break; case SVC_POOL_PERNODE: pidx = m->to_pool[cpu_to_node(cpu)]; break; } return &serv->sv_pools[pidx % serv->sv_nrpools]; } static int svc_rpcb_setup(struct svc_serv *serv, struct net *net) { int err; err = rpcb_create_local(net); if (err) return err; /* Remove any stale portmap registrations */ svc_unregister(serv, net); return 0; } void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) { svc_unregister(serv, net); rpcb_put_local(net); } EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { unsigned int p, i; for (p = 0; p < serv->sv_nprogs; p++) { struct svc_program *progp = &serv->sv_programs[p]; for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; if (!progp->pg_vers[i]->vs_hidden) return 1; } } return 0; } int svc_bind(struct svc_serv *serv, struct net *net) { if (!svc_uses_rpcbind(serv)) return 0; return svc_rpcb_setup(serv, net); } EXPORT_SYMBOL_GPL(svc_bind); #if defined(CONFIG_SUNRPC_BACKCHANNEL) static void __svc_init_bc(struct svc_serv *serv) { lwq_init(&serv->sv_cb_list); } #else static void __svc_init_bc(struct svc_serv *serv) { } #endif /* * Create an RPC service */ static struct svc_serv * __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats, unsigned int bufsize, int npools, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; unsigned int xdrsize; unsigned int i; if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; serv->sv_name = prog->pg_name; serv->sv_programs = prog; serv->sv_nprogs = nprogs; serv->sv_stats = stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); serv->sv_threadfn = threadfn; xdrsize = 0; for (i = 0; i < nprogs; i++) { struct svc_program *progp = &prog[i]; progp->pg_lovers = progp->pg_nvers-1; for (vers = 0; vers < progp->pg_nvers ; vers++) if (progp->pg_vers[vers]) { progp->pg_hivers = vers; if (progp->pg_lovers > vers) progp->pg_lovers = vers; if (progp->pg_vers[vers]->vs_xdrsize > xdrsize) xdrsize = progp->pg_vers[vers]->vs_xdrsize; } } serv->sv_xdrsize = xdrsize; INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); timer_setup(&serv->sv_temptimer, NULL, 0); spin_lock_init(&serv->sv_lock); __svc_init_bc(serv); serv->sv_nrpools = npools; serv->sv_pools = kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), GFP_KERNEL); if (!serv->sv_pools) { kfree(serv); return NULL; } for (i = 0; i < serv->sv_nrpools; i++) { struct svc_pool *pool = &serv->sv_pools[i]; dprintk("svc: initialising pool %u for %s\n", i, serv->sv_name); pool->sp_id = i; lwq_init(&pool->sp_xprts); INIT_LIST_HEAD(&pool->sp_all_threads); init_llist_head(&pool->sp_idle_threads); percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); } return serv; } /** * svc_create - Create an RPC service * @prog: the RPC program the new service will handle * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, int (*threadfn)(void *data)) { return __svc_create(prog, 1, NULL, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); /** * svc_create_pooled - Create an RPC service with pooled threads * @prog: Array of RPC programs the new service will handle * @nprogs: Number of programs in the array * @stats: the stats struct if desired * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create_pooled(struct svc_program *prog, unsigned int nprogs, struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); serv = __svc_create(prog, nprogs, stats, bufsize, npools, threadfn); if (!serv) goto out_err; serv->sv_is_pooled = true; return serv; out_err: svc_pool_map_put(); return NULL; } EXPORT_SYMBOL_GPL(svc_create_pooled); /* * Destroy an RPC service. Should be called with appropriate locking to * protect sv_permsocks and sv_tempsocks. */ void svc_destroy(struct svc_serv **servp) { struct svc_serv *serv = *servp; unsigned int i; *servp = NULL; dprintk("svc: svc_destroy(%s)\n", serv->sv_programs->pg_name); timer_shutdown_sync(&serv->sv_temptimer); /* * Remaining transports at this point are not expected. */ WARN_ONCE(!list_empty(&serv->sv_permsocks), "SVC: permsocks remain for %s\n", serv->sv_programs->pg_name); WARN_ONCE(!list_empty(&serv->sv_tempsocks), "SVC: tempsocks remain for %s\n", serv->sv_programs->pg_name); cache_clean_deferred(serv); if (serv->sv_is_pooled) svc_pool_map_put(); for (i = 0; i < serv->sv_nrpools; i++) { struct svc_pool *pool = &serv->sv_pools[i]; percpu_counter_destroy(&pool->sp_messages_arrived); percpu_counter_destroy(&pool->sp_sockets_queued); percpu_counter_destroy(&pool->sp_threads_woken); } kfree(serv->sv_pools); kfree(serv); } EXPORT_SYMBOL_GPL(svc_destroy); static bool svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { rqstp->rq_maxpages = svc_serv_maxpages(serv); /* rq_pages' last entry is NULL for historical reasons. */ rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, sizeof(struct page *), GFP_KERNEL, node); if (!rqstp->rq_pages) return false; return true; } /* * Release an RPC server buffer */ static void svc_release_buffer(struct svc_rqst *rqstp) { unsigned long i; for (i = 0; i < rqstp->rq_maxpages; i++) if (rqstp->rq_pages[i]) put_page(rqstp->rq_pages[i]); kfree(rqstp->rq_pages); } static void svc_rqst_free(struct svc_rqst *rqstp) { folio_batch_release(&rqstp->rq_fbatch); kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); if (rqstp->rq_scratch_page) put_page(rqstp->rq_scratch_page); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); kfree_rcu(rqstp, rq_rcu_head); } static struct svc_rqst * svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node); if (!rqstp) return rqstp; folio_batch_init(&rqstp->rq_fbatch); rqstp->rq_server = serv; rqstp->rq_pool = pool; rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); if (!rqstp->rq_scratch_page) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_enomem; rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_resp) goto out_enomem; if (!svc_init_buffer(rqstp, serv, node)) goto out_enomem; rqstp->rq_bvec = kcalloc_node(rqstp->rq_maxpages, sizeof(struct bio_vec), GFP_KERNEL, node); if (!rqstp->rq_bvec) goto out_enomem; rqstp->rq_err = -EAGAIN; /* No error yet */ serv->sv_nrthreads += 1; pool->sp_nrthreads += 1; /* Protected by whatever lock the service uses when calling * svc_set_num_threads() */ list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); return rqstp; out_enomem: svc_rqst_free(rqstp); return NULL; } /** * svc_pool_wake_idle_thread - Awaken an idle thread in @pool * @pool: service thread pool * * Can be called from soft IRQ or process context. Finding an idle * service thread and marking it BUSY is atomic with respect to * other calls to svc_pool_wake_idle_thread(). * */ void svc_pool_wake_idle_thread(struct svc_pool *pool) { struct svc_rqst *rqstp; struct llist_node *ln; rcu_read_lock(); ln = READ_ONCE(pool->sp_idle_threads.first); if (ln) { rqstp = llist_entry(ln, struct svc_rqst, rq_idle); WRITE_ONCE(rqstp->rq_qtime, ktime_get()); if (!task_is_running(rqstp->rq_task)) { wake_up_process(rqstp->rq_task); trace_svc_pool_thread_wake(pool, rqstp->rq_task->pid); percpu_counter_inc(&pool->sp_threads_woken); } else { trace_svc_pool_thread_running(pool, rqstp->rq_task->pid); } rcu_read_unlock(); return; } rcu_read_unlock(); trace_svc_pool_thread_noidle(pool, 0); } EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread); static struct svc_pool * svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) { return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools]; } static struct svc_pool * svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool, unsigned int *state) { struct svc_pool *pool; unsigned int i; pool = target_pool; if (!pool) { for (i = 0; i < serv->sv_nrpools; i++) { pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; if (pool->sp_nrthreads) break; } } if (pool && pool->sp_nrthreads) { set_bit(SP_VICTIM_REMAINS, &pool->sp_flags); set_bit(SP_NEED_VICTIM, &pool->sp_flags); return pool; } return NULL; } static int svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { struct svc_rqst *rqstp; struct task_struct *task; struct svc_pool *chosen_pool; unsigned int state = serv->sv_nrthreads-1; int node; int err; do { nrservs--; chosen_pool = svc_pool_next(serv, pool, &state); node = svc_pool_map_get_node(chosen_pool->sp_id); rqstp = svc_prepare_thread(serv, chosen_pool, node); if (!rqstp) return -ENOMEM; task = kthread_create_on_node(serv->sv_threadfn, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { svc_exit_thread(rqstp); return PTR_ERR(task); } rqstp->rq_task = task; if (serv->sv_nrpools > 1) svc_pool_map_set_cpumask(task, chosen_pool->sp_id); svc_sock_update_bufs(serv); wake_up_process(task); wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN); err = rqstp->rq_err; if (err) { svc_exit_thread(rqstp); return err; } } while (nrservs > 0); return 0; } static int svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { unsigned int state = serv->sv_nrthreads-1; struct svc_pool *victim; do { victim = svc_pool_victim(serv, pool, &state); if (!victim) break; svc_pool_wake_idle_thread(victim); wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS, TASK_IDLE); nrservs++; } while (nrservs < 0); return 0; } /** * svc_set_num_threads - adjust number of threads per RPC service * @serv: RPC service to adjust * @pool: Specific pool from which to choose threads, or NULL * @nrservs: New number of threads for @serv (0 or less means kill all threads) * * Create or destroy threads to make the number of threads for @serv the * given number. If @pool is non-NULL, change only threads in that pool; * otherwise, round-robin between all pools for @serv. @serv's * sv_nrthreads is adjusted for each thread created or destroyed. * * Caller must ensure mutual exclusion between this and server startup or * shutdown. * * Returns zero on success or a negative errno if an error occurred while * starting a thread. */ int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { if (!pool) nrservs -= serv->sv_nrthreads; else nrservs -= pool->sp_nrthreads; if (nrservs > 0) return svc_start_kthreads(serv, pool, nrservs); if (nrservs < 0) return svc_stop_kthreads(serv, pool, nrservs); return 0; } EXPORT_SYMBOL_GPL(svc_set_num_threads); /** * svc_rqst_replace_page - Replace one page in rq_pages[] * @rqstp: svc_rqst with pages to replace * @page: replacement page * * When replacing a page in rq_pages, batch the release of the * replaced pages to avoid hammering the page allocator. * * Return values: * %true: page replaced * %false: array bounds checking failed */ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { struct page **begin = rqstp->rq_pages; struct page **end = &rqstp->rq_pages[rqstp->rq_maxpages]; if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { trace_svc_replace_page_err(rqstp); return false; } if (*rqstp->rq_next_page) { if (!folio_batch_add(&rqstp->rq_fbatch, page_folio(*rqstp->rq_next_page))) __folio_batch_release(&rqstp->rq_fbatch); } get_page(page); *(rqstp->rq_next_page++) = page; return true; } EXPORT_SYMBOL_GPL(svc_rqst_replace_page); /** * svc_rqst_release_pages - Release Reply buffer pages * @rqstp: RPC transaction context * * Release response pages that might still be in flight after * svc_send, and any spliced filesystem-owned pages. */ void svc_rqst_release_pages(struct svc_rqst *rqstp) { int i, count = rqstp->rq_next_page - rqstp->rq_respages; if (count) { release_pages(rqstp->rq_respages, count); for (i = 0; i < count; i++) rqstp->rq_respages[i] = NULL; } } /** * svc_exit_thread - finalise the termination of a sunrpc server thread * @rqstp: the svc_rqst which represents the thread. * * When a thread started with svc_new_thread() exits it must call * svc_exit_thread() as its last act. This must be done with the * service mutex held. Normally this is held by a DIFFERENT thread, the * one that is calling svc_set_num_threads() and which will wait for * SP_VICTIM_REMAINS to be cleared before dropping the mutex. If the * thread exits for any reason other than svc_thread_should_stop() * returning %true (which indicated that svc_set_num_threads() is * waiting for it to exit), then it must take the service mutex itself, * which can only safely be done using mutex_try_lock(). */ void svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct svc_pool *pool = rqstp->rq_pool; list_del_rcu(&rqstp->rq_all); pool->sp_nrthreads -= 1; serv->sv_nrthreads -= 1; svc_sock_update_bufs(serv); svc_rqst_free(rqstp); clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags); } EXPORT_SYMBOL_GPL(svc_exit_thread); /* * Register an "inet" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. * * No netconfig infrastructure is available in the kernel, so * we map IP_ protocol numbers to netids by hand. * * Returns zero on success; a negative errno value is returned * if any error occurs. */ static int __svc_rpcb_register4(struct net *net, const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { const struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; const char *netid; int error; switch (protocol) { case IPPROTO_UDP: netid = RPCBIND_NETID_UDP; break; case IPPROTO_TCP: netid = RPCBIND_NETID_TCP; break; default: return -ENOPROTOOPT; } error = rpcb_v4_register(net, program, version, (const struct sockaddr *)&sin, netid); /* * User space didn't support rpcbind v4, so retry this * registration request with the legacy rpcbind v2 protocol. */ if (error == -EPROTONOSUPPORT) error = rpcb_register(net, program, version, protocol, port); return error; } #if IS_ENABLED(CONFIG_IPV6) /* * Register an "inet6" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. * * No netconfig infrastructure is available in the kernel, so * we map IP_ protocol numbers to netids by hand. * * Returns zero on success; a negative errno value is returned * if any error occurs. */ static int __svc_rpcb_register6(struct net *net, const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { const struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; const char *netid; int error; switch (protocol) { case IPPROTO_UDP: netid = RPCBIND_NETID_UDP6; break; case IPPROTO_TCP: netid = RPCBIND_NETID_TCP6; break; default: return -ENOPROTOOPT; } error = rpcb_v4_register(net, program, version, (const struct sockaddr *)&sin6, netid); /* * User space didn't support rpcbind version 4, so we won't * use a PF_INET6 listener. */ if (error == -EPROTONOSUPPORT) error = -EAFNOSUPPORT; return error; } #endif /* IS_ENABLED(CONFIG_IPV6) */ /* * Register a kernel RPC service via rpcbind version 4. * * Returns zero on success; a negative errno value is returned * if any error occurs. */ static int __svc_register(struct net *net, const char *progname, const u32 program, const u32 version, const int family, const unsigned short protocol, const unsigned short port) { int error = -EAFNOSUPPORT; switch (family) { case PF_INET: error = __svc_rpcb_register4(net, program, version, protocol, port); break; #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: error = __svc_rpcb_register6(net, program, version, protocol, port); #endif } trace_svc_register(progname, version, family, protocol, port, error); return error; } static int svc_rpcbind_set_version(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { return __svc_register(net, progp->pg_name, progp->pg_prog, version, family, proto, port); } int svc_generic_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { const struct svc_version *vers = progp->pg_vers[version]; int error; if (vers == NULL) return 0; if (vers->vs_hidden) { trace_svc_noregister(progp->pg_name, version, proto, port, family, 0); return 0; } /* * Don't register a UDP port if we need congestion * control. */ if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP) return 0; error = svc_rpcbind_set_version(net, progp, version, family, proto, port); return (vers->vs_rpcb_optnl) ? 0 : error; } EXPORT_SYMBOL_GPL(svc_generic_rpcbind_set); /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register * @net: net namespace for the service to register * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * * Service is registered for any address in the passed-in protocol family */ int svc_register(const struct svc_serv *serv, struct net *net, const int family, const unsigned short proto, const unsigned short port) { unsigned int p, i; int error = 0; WARN_ON_ONCE(proto == 0 && port == 0); if (proto == 0 && port == 0) return -EINVAL; for (p = 0; p < serv->sv_nprogs; p++) { struct svc_program *progp = &serv->sv_programs[p]; for (i = 0; i < progp->pg_nvers; i++) { error = progp->pg_rpcbind_set(net, progp, i, family, proto, port); if (error < 0) { printk(KERN_WARNING "svc: failed to register " "%sv%u RPC service (errno %d).\n", progp->pg_name, i, -error); break; } } } return error; } /* * If user space is running rpcbind, it should take the v4 UNSET * and clear everything for this [program, version]. If user space * is running portmap, it will reject the v4 UNSET, but won't have * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient * in this case to clear all existing entries for [program, version]. */ static void __svc_unregister(struct net *net, const u32 program, const u32 version, const char *progname) { int error; error = rpcb_v4_register(net, program, version, NULL, ""); /* * User space didn't support rpcbind v4, so retry this * request with the legacy rpcbind v2 protocol. */ if (error == -EPROTONOSUPPORT) error = rpcb_register(net, program, version, 0, 0); trace_svc_unregister(progname, version, error); } /* * All netids, bind addresses and ports registered for [program, version] * are removed from the local rpcbind database (if the service is not * hidden) to make way for a new instance of the service. * * The result of unregistration is reported via dprintk for those who want * verification of the result, but is otherwise not important. */ static void svc_unregister(const struct svc_serv *serv, struct net *net) { struct sighand_struct *sighand; unsigned long flags; unsigned int p, i; clear_thread_flag(TIF_SIGPENDING); for (p = 0; p < serv->sv_nprogs; p++) { struct svc_program *progp = &serv->sv_programs[p]; for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; if (progp->pg_vers[i]->vs_hidden) continue; __svc_unregister(net, progp->pg_prog, i, progp->pg_name); } } rcu_read_lock(); sighand = rcu_dereference(current->sighand); spin_lock_irqsave(&sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(&sighand->siglock, flags); rcu_read_unlock(); } /* * dprintk the given error with the address of the client that caused it. */ #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static __printf(2, 3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) { struct va_format vaf; va_list args; char buf[RPC_MAX_ADDRBUFLEN]; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dprintk("svc: %s: %pV", svc_print_addr(rqstp, buf, sizeof(buf)), &vaf); va_end(args); } #else static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {} #endif __be32 svc_generic_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { const struct svc_version *versp = NULL; /* compiler food */ const struct svc_procedure *procp = NULL; if (rqstp->rq_vers >= progp->pg_nvers ) goto err_bad_vers; versp = progp->pg_vers[rqstp->rq_vers]; if (!versp) goto err_bad_vers; /* * Some protocol versions (namely NFSv4) require some form of * congestion control. (See RFC 7530 section 3.1 paragraph 2) * In other words, UDP is not allowed. We mark those when setting * up the svc_xprt, and verify that here. * * The spec is not very clear about what error should be returned * when someone tries to access a server that is listening on UDP * for lower versions. RPC_PROG_MISMATCH seems to be the closest * fit. */ if (versp->vs_need_cong_ctrl && rqstp->rq_xprt && !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) goto err_bad_vers; if (rqstp->rq_proc >= versp->vs_nproc) goto err_bad_proc; rqstp->rq_procinfo = procp = &versp->vs_proc[rqstp->rq_proc]; /* Initialize storage for argp and resp */ memset(rqstp->rq_argp, 0, procp->pc_argzero); memset(rqstp->rq_resp, 0, procp->pc_ressize); /* Bump per-procedure stats counter */ this_cpu_inc(versp->vs_count[rqstp->rq_proc]); ret->dispatch = versp->vs_dispatch; return rpc_success; err_bad_vers: ret->mismatch.lovers = progp->pg_lovers; ret->mismatch.hivers = progp->pg_hivers; return rpc_prog_mismatch; err_bad_proc: return rpc_proc_unavail; } EXPORT_SYMBOL_GPL(svc_generic_init_request); /* * Common routine for processing the RPC request. */ static int svc_process_common(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_res_stream; struct svc_program *progp = NULL; const struct svc_procedure *procp = NULL; struct svc_serv *serv = rqstp->rq_server; struct svc_process_info process; enum svc_auth_status auth_res; unsigned int aoffset; int pr, rc; __be32 *p; /* Reset the accept_stat for the RPC */ rqstp->rq_accept_statp = NULL; /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); /* Construct the first words of the reply: */ svcxdr_init_encode(rqstp); xdr_stream_encode_be32(xdr, rqstp->rq_xid); xdr_stream_encode_be32(xdr, rpc_reply); p = xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 4); if (unlikely(!p)) goto err_short_len; if (*p++ != cpu_to_be32(RPC_VERSION)) goto err_bad_rpc; xdr_stream_encode_be32(xdr, rpc_msg_accepted); rqstp->rq_prog = be32_to_cpup(p++); rqstp->rq_vers = be32_to_cpup(p++); rqstp->rq_proc = be32_to_cpup(p); for (pr = 0; pr < serv->sv_nprogs; pr++) if (rqstp->rq_prog == serv->sv_programs[pr].pg_prog) progp = &serv->sv_programs[pr]; /* * Decode auth data, and add verifier to reply buffer. * We do this before anything else in order to get a decent * auth verifier. */ auth_res = svc_authenticate(rqstp); /* Also give the program a chance to reject this call: */ if (auth_res == SVC_OK && progp) auth_res = progp->pg_authenticate(rqstp); trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { case SVC_OK: break; case SVC_GARBAGE: rqstp->rq_auth_stat = rpc_autherr_badcred; goto err_bad_auth; case SVC_DENIED: goto err_bad_auth; case SVC_CLOSE: goto close; case SVC_DROP: goto dropit; case SVC_COMPLETE: goto sendit; default: pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res); rqstp->rq_auth_stat = rpc_autherr_failed; goto err_bad_auth; } if (progp == NULL) goto err_bad_prog; switch (progp->pg_init_request(rqstp, progp, &process)) { case rpc_success: break; case rpc_prog_unavail: goto err_bad_prog; case rpc_prog_mismatch: goto err_bad_vers; case rpc_proc_unavail: goto err_bad_proc; } procp = rqstp->rq_procinfo; /* Should this check go into the dispatcher? */ if (!procp || !procp->pc_func) goto err_bad_proc; /* Syntactic check complete */ if (serv->sv_stats) serv->sv_stats->rpccnt++; trace_svc_process(rqstp, progp->pg_name); aoffset = xdr_stream_pos(xdr); /* un-reserve some of the out-queue now that we have a * better idea of reply size */ if (procp->pc_xdrressize) svc_reserve_auth(rqstp, procp->pc_xdrressize<<2); /* Call the function that processes the request. */ rc = process.dispatch(rqstp); if (procp->pc_release) procp->pc_release(rqstp); xdr_finish_decode(xdr); if (!rc) goto dropit; if (rqstp->rq_auth_stat != rpc_auth_ok) goto err_bad_auth; if (*rqstp->rq_accept_statp != rpc_success) xdr_truncate_encode(xdr, aoffset); if (procp->pc_encode == NULL) goto dropit; sendit: if (svc_authorise(rqstp)) goto close_xprt; return 1; /* Caller can now send it */ dropit: svc_authorise(rqstp); /* doesn't hurt to call this twice */ dprintk("svc: svc_process dropit\n"); return 0; close: svc_authorise(rqstp); close_xprt: if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) svc_xprt_close(rqstp->rq_xprt); dprintk("svc: svc_process close\n"); return 0; err_short_len: svc_printk(rqstp, "short len %u, dropping request\n", rqstp->rq_arg.len); goto close_xprt; err_bad_rpc: if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_MISMATCH); /* Only RPCv2 supported */ xdr_stream_encode_u32(xdr, RPC_VERSION); xdr_stream_encode_u32(xdr, RPC_VERSION); return 1; /* don't wrap */ err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); if (serv->sv_stats) serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of reply status: */ xdr_truncate_encode(xdr, XDR_UNIT * 2); xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_AUTH_ERROR); xdr_stream_encode_be32(xdr, rqstp->rq_auth_stat); goto sendit; err_bad_prog: dprintk("svc: unknown program %d\n", rqstp->rq_prog); if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; err_bad_vers: svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_mismatch; /* * svc_authenticate() has already added the verifier and * advanced the stream just past rq_accept_statp. */ xdr_stream_encode_u32(xdr, process.mismatch.lovers); xdr_stream_encode_u32(xdr, process.mismatch.hivers); goto sendit; err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; } /* * Drop request */ static void svc_drop(struct svc_rqst *rqstp) { trace_svc_drop(rqstp); } /** * svc_process - Execute one RPC transaction * @rqstp: RPC transaction context * */ void svc_process(struct svc_rqst *rqstp) { struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; #if IS_ENABLED(CONFIG_FAIL_SUNRPC) if (!fail_sunrpc.ignore_server_disconnect && should_fail(&fail_sunrpc.attr, 1)) svc_xprt_deferred_close(rqstp->rq_xprt); #endif /* * Setup response xdr_buf. * Initially it has just one page */ rqstp->rq_next_page = &rqstp->rq_respages[1]; resv->iov_base = page_address(rqstp->rq_respages[0]); resv->iov_len = 0; rqstp->rq_res.pages = rqstp->rq_next_page; rqstp->rq_res.len = 0; rqstp->rq_res.page_base = 0; rqstp->rq_res.page_len = 0; rqstp->rq_res.buflen = PAGE_SIZE; rqstp->rq_res.tail[0].iov_base = NULL; rqstp->rq_res.tail[0].iov_len = 0; svcxdr_init_decode(rqstp); p = xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2); if (unlikely(!p)) goto out_drop; rqstp->rq_xid = *p++; if (unlikely(*p != rpc_call)) goto out_baddir; if (!svc_process_common(rqstp)) goto out_drop; svc_send(rqstp); return; out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", be32_to_cpu(*p)); if (rqstp->rq_server->sv_stats) rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) /** * svc_process_bc - process a reverse-direction RPC request * @req: RPC request to be used for client-side processing * @rqstp: server-side execution context * */ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) { struct rpc_timeout timeout = { .to_increment = 0, }; struct rpc_task *task; int proc_error; /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xid = req->rq_xid; rqstp->rq_prot = req->rq_xprt->prot; rqstp->rq_bc_net = req->rq_xprt->xprt_net; rqstp->rq_addrlen = sizeof(req->rq_xprt->addr); memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen); memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg)); memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res)); /* Adjust the argument buffer length */ rqstp->rq_arg.len = req->rq_private_buf.len; if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; rqstp->rq_arg.page_len = 0; } else if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; else rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len; /* Reset the response buffer */ rqstp->rq_res.head[0].iov_len = 0; /* * Skip the XID and calldir fields because they've already * been processed by the caller. */ svcxdr_init_decode(rqstp); if (!xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2)) return; /* Parse and execute the bc call */ proc_error = svc_process_common(rqstp); atomic_dec(&req->rq_xprt->bc_slot_count); if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); return; } /* Finally, send the reply synchronously */ if (rqstp->bc_to_initval > 0) { timeout.to_initval = rqstp->bc_to_initval; timeout.to_retries = rqstp->bc_to_retries; } else { timeout.to_initval = req->rq_xprt->timeout->to_initval; timeout.to_retries = req->rq_xprt->timeout->to_retries; } timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); if (IS_ERR(task)) return; WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); rpc_put_task(task); } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** * svc_max_payload - Return transport-specific limit on the RPC payload * @rqstp: RPC transaction context * * Returns the maximum number of payload bytes the current transport * allows. */ u32 svc_max_payload(const struct svc_rqst *rqstp) { u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload; if (rqstp->rq_server->sv_max_payload < max) max = rqstp->rq_server->sv_max_payload; return max; } EXPORT_SYMBOL_GPL(svc_max_payload); /** * svc_proc_name - Return RPC procedure name in string form * @rqstp: svc_rqst to operate on * * Return value: * Pointer to a NUL-terminated string */ const char *svc_proc_name(const struct svc_rqst *rqstp) { if (rqstp && rqstp->rq_procinfo) return rqstp->rq_procinfo->pc_name; return "unknown"; } /** * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in rqstp->rq_res * @length: size of payload, in bytes * * Returns zero on success, or a negative errno if a permanent * error occurred. */ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset, length); } EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call * @rqstp: svc_rqst to operate on * @first: buffer containing first section of pathname * @p: buffer containing remaining section of pathname * @total: total length of the pathname argument * * The VFS symlink API demands a NUL-terminated pathname in mapped memory. * Returns pointer to a NUL-terminated string, or an ERR_PTR. Caller must free * the returned string. */ char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total) { size_t len, remaining; char *result, *dst; result = kmalloc(total + 1, GFP_KERNEL); if (!result) return ERR_PTR(-ESERVERFAULT); dst = result; remaining = total; len = min_t(size_t, total, first->iov_len); if (len) { memcpy(dst, first->iov_base, len); dst += len; remaining -= len; } if (remaining) { len = min_t(size_t, remaining, PAGE_SIZE); memcpy(dst, p, len); dst += len; } *dst = '\0'; /* Sanity check: Linux doesn't allow the pathname argument to * contain a NUL byte. */ if (strlen(result) != total) { kfree(result); return ERR_PTR(-EINVAL); } return result; } EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname); |
| 95 7683 872 8706 872 8707 867 342 342 342 120 342 298 302 27 8269 7685 302 2658 1006 999 8485 8469 8449 1005 5630 2648 2658 2648 2660 7674 7646 7649 4529 965 887 47 47 47 47 453 452 245 376 7672 7672 7675 7646 7647 454 7548 3459 7538 209 207 208 207 208 209 207 207 207 201 208 207 299 3469 3558 3550 298 298 299 299 298 299 299 298 299 299 299 7711 7686 7682 410 411 120 120 120 130 7 7 2 7 3466 3464 3443 7667 3479 7679 3468 6025 3470 7664 3470 3469 593 593 591 592 7655 152 8 8 8 8 8 1 7 7 3 4 7 8 277 141 277 277 152 141 141 8 302 277 303 303 3348 3337 2671 3342 3138 305 305 304 303 301 302 46 303 302 303 7 8 8 3339 3334 8 7683 7685 7674 7682 3479 7693 7666 7692 7659 7390 7679 5715 7691 7665 4746 3482 3481 3481 390 3433 7211 7664 7693 7667 7673 7665 7646 7605 7645 603 7652 7690 9002 8996 9004 7092 958 7046 7046 7094 7070 33 33 33 33 33 33 33 33 4216 4217 4223 4221 4220 1723 3480 3481 3482 3602 2356 1751 3545 3533 3616 3609 1362 1367 1365 1367 1364 245 245 244 242 179 179 179 799 240 240 240 240 872 524 742 873 593 594 594 595 592 593 171 593 592 591 592 27 593 592 594 592 594 595 590 594 1099 1099 1101 873 592 593 593 594 593 595 587 27 27 27 591 27 27 27 27 27 27 27 27 593 5 130 129 130 130 129 130 129 130 130 130 3232 3219 2354 1875 740 739 1829 1823 524 873 783 873 873 873 551 873 1809 3203 3219 3220 3232 106 106 235 871 867 3217 1349 120 732 730 295 730 120 118 117 120 2653 2267 2254 2646 2661 2661 2661 2663 2648 2264 2264 2266 2263 2263 2260 2265 1239 1249 1539 24 2257 2260 2254 2254 2264 2265 2257 9 29 3 3 3 299 299 299 299 299 299 298 298 299 299 121 121 121 121 121 299 299 298 299 299 299 299 299 299 299 299 299 299 297 299 298 298 298 299 298 297 299 299 298 298 299 299 297 121 121 299 299 299 299 299 299 299 299 298 299 299 298 298 298 298 299 299 202 121 298 201 298 299 298 299 298 299 299 299 298 299 299 297 299 296 297 76 76 76 76 298 202 203 121 231 298 299 299 299 179 179 178 179 156 51 178 299 299 299 299 299 298 299 296 75 299 298 1 299 299 231 299 299 299 299 297 299 299 297 299 298 299 120 119 120 120 119 120 97 120 120 120 120 120 120 120 120 120 120 120 120 120 9 9 1 1 1 1 18 18 4 4 4 3 3 119 5206 5214 5207 140 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 | // SPDX-License-Identifier: GPL-2.0-only /* * kernel/workqueue.c - generic async execution with shared worker pool * * Copyright (C) 2002 Ingo Molnar * * Derived from the taskqueue/keventd code by: * David Woodhouse <dwmw2@infradead.org> * Andrew Morton * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> * * Made to use alloc_percpu by Christoph Lameter. * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * This is the generic async execution mechanism. Work items as are * executed in process context. The worker pool is shared and * automatically managed. There are two worker pools for each CPU (one for * normal work items and the other for high priority ones) and some extra * pools for workqueues which are not bound to any specific CPU - the * number of these backing pools is dynamic. * * Please read Documentation/core-api/workqueue.rst for details. */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/signal.h> #include <linux/completion.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/kthread.h> #include <linux/hardirq.h> #include <linux/mempolicy.h> #include <linux/freezer.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> #include <linux/jhash.h> #include <linux/hashtable.h> #include <linux/rculist.h> #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/sched/isolation.h> #include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/kvm_para.h> #include <linux/delay.h> #include <linux/irq_work.h> #include "workqueue_internal.h" enum worker_pool_flags { /* * worker_pool flags * * A bound pool is either associated or disassociated with its CPU. * While associated (!DISASSOCIATED), all workers are bound to the * CPU and none has %WORKER_UNBOUND set and concurrency management * is in effect. * * While DISASSOCIATED, the cpu may be offline and all workers have * %WORKER_UNBOUND set and concurrency management disabled, and may * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. * * As there can only be one concurrent BH execution context per CPU, a * BH pool is per-CPU and always DISASSOCIATED. */ POOL_BH = 1 << 0, /* is a BH pool */ POOL_MANAGER_ACTIVE = 1 << 1, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_BH_DRAINING = 1 << 3, /* draining after CPU offline */ }; enum worker_flags { /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ WORKER_REBOUND = 1 << 8, /* worker was rebound */ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND | WORKER_REBOUND, }; enum work_cancel_flags { WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ WORK_CANCEL_DISABLE = 1 << 1, /* canceling to disable */ }; enum wq_internal_consts { NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, /* call for help after 10ms (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ /* * Rescue workers are used only on emergencies and shared by * all cpus. Give MIN_NICE. */ RESCUER_NICE_LEVEL = MIN_NICE, HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 32, WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */ }; /* * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because * msecs_to_jiffies() can't be an initializer. */ #define BH_WORKER_JIFFIES msecs_to_jiffies(2) #define BH_WORKER_RESTARTS 10 /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only for * everyone else. * * P: Preemption protected. Disabling preemption is enough and should * only be modified and accessed from the local cpu. * * L: pool->lock protected. Access with pool->lock held. * * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for * reads. * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. * * S: Only modified by worker self. * * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. * * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or * RCU for reads. * * WQ: wq->mutex protected. * * WR: wq->mutex protected for writes. RCU protected for reads. * * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read * with READ_ONCE() without locking. * * MD: wq_mayday_lock protected. * * WD: Used internally by the watchdog. */ /* struct worker is defined in workqueue_internal.h */ struct worker_pool { raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ /* * The counter is incremented in a process context on the associated CPU * w/ preemption disabled, and decremented or reset in the same context * but w/ pool->lock held. The readers grab pool->lock and are * guaranteed to see if the counter reached zero. */ int nr_running; struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle workers */ struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct work_struct idle_cull_work; /* L: worker idle cleanup */ struct timer_list mayday_timer; /* L: SOS timer for workers */ /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ struct ida worker_ida; /* worker IDs for task name */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; }; /* * Per-pool_workqueue statistics. These can be monitored using * tools/workqueue/wq_monitor.py. */ enum pool_workqueue_stats { PWQ_STAT_STARTED, /* work items started execution */ PWQ_STAT_COMPLETED, /* work items completed execution */ PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ PWQ_NR_STATS, }; /* * The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT * of work_struct->data are used for flags and the remaining high bits * point to the pwq; thus, pwqs need to be aligned at two's power of the * number of flag bits. */ struct pool_workqueue { struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ bool plugged; /* L: execution suspended */ /* * nr_active management and WORK_STRUCT_INACTIVE: * * When pwq->nr_active >= max_active, new work item is queued to * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * * All work items marked with WORK_STRUCT_INACTIVE do not participate in * nr_active and all work items in pwq->inactive_works are marked with * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are * in pwq->inactive_works. Some of them are ready to run in * pool->worklist or worker->scheduled. Those work itmes are only struct * wq_barrier which is used for flush_work() and should not participate * in nr_active. For non-barrier work item, it is marked with * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ struct list_head inactive_works; /* L: inactive works */ struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ u64 stats[PWQ_NR_STATS]; /* * Release of unbound pwq is punted to a kthread_worker. See put_pwq() * and pwq_release_workfn() for details. pool_workqueue itself is also * RCU protected so that the first pwq can be determined without * grabbing wq->mutex. */ struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_PWQ_SHIFT); /* * Structure used to wait for workqueue flush. */ struct wq_flusher { struct list_head list; /* WQ: list of flushers */ int flush_color; /* WQ: flush color waiting for */ struct completion done; /* flush completion */ }; struct wq_device; /* * Unlike in a per-cpu workqueue where max_active limits its concurrency level * on each CPU, in an unbound workqueue, max_active applies to the whole system. * As sharing a single nr_active across multiple sockets can be very expensive, * the counting and enforcement is per NUMA node. * * The following struct is used to enforce per-node max_active. When a pwq wants * to start executing a work item, it should increment ->nr using * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in * round-robin order. */ struct wq_node_nr_active { int max; /* per-node max_active */ atomic_t nr; /* per-node nr_active */ raw_spinlock_t lock; /* nests inside pool locks */ struct list_head pending_pwqs; /* LN: pwqs with inactive works */ }; /* * The externally visible workqueue. It relays the issued work items to * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PR: list of all workqueues */ struct mutex mutex; /* protects this wq */ int work_color; /* WQ: current work color */ int flush_color; /* WQ: current flush color */ atomic_t nr_pwqs_to_flush; /* flush in progress */ struct wq_flusher *first_flusher; /* WQ: first flusher */ struct list_head flusher_queue; /* WQ: flush waiters */ struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ /* See alloc_workqueue() function comment for info on min/max_active */ int max_active; /* WO: max active works */ int min_active; /* WO: min active works */ int saved_max_active; /* WQ: saved max_active */ int saved_min_active; /* WQ: saved min_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ #endif #ifdef CONFIG_LOCKDEP char *lock_name; struct lock_class_key key; struct lockdep_map __lockdep_map; struct lockdep_map *lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ /* * Destruction of workqueue_struct is RCU protected to allow walking * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq. */ struct rcu_head rcu; /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */ struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ }; /* * Each pod type describes how CPUs should be grouped for unbound workqueues. * See the comment above workqueue_attrs->affn_scope. */ struct wq_pod_type { int nr_pods; /* number of pods */ cpumask_var_t *pod_cpus; /* pod -> cpus */ int *pod_node; /* pod -> node */ int *cpu_pod; /* cpu -> pod */ }; struct work_offq_data { u32 pool_id; u32 disable; u32 flags; }; static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", [WQ_AFFN_SMT] = "smt", [WQ_AFFN_CACHE] = "cache", [WQ_AFFN_NUMA] = "numa", [WQ_AFFN_SYSTEM] = "system", }; /* * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. * The actual value is initialized in wq_cpu_intensive_thresh_init(). */ static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT static unsigned int wq_cpu_intensive_warning_thresh = 4; module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644); #endif /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ static bool wq_topo_initialized __read_mostly = false; static struct kmem_cache *pwq_cache; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ /* wait for manager to go away */ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */ static cpumask_var_t wq_online_cpumask; /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; /* PL: user requested unbound cpumask via sysfs */ static cpumask_var_t wq_requested_unbound_cpumask; /* PL: isolated cpumask to be excluded from unbound cpumask */ static cpumask_var_t wq_isolated_cpumask; /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); /* * Local execution of unbound work items is no longer guaranteed. The * following always forces round-robin CPU selection on unbound work items * to uncover usages which depend on it. */ #ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU static bool wq_debug_force_rr_cpu = true; #else static bool wq_debug_force_rr_cpu = false; #endif module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* to raise softirq for the BH worker pools on other CPUs */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works); /* the BH worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ /* PL: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; /* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. */ static struct kthread_worker *pwq_release_worker __ro_after_init; struct workqueue_struct *system_wq __ro_after_init; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_percpu_wq __ro_after_init; EXPORT_SYMBOL(system_percpu_wq); struct workqueue_struct *system_highpri_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_long_wq); struct workqueue_struct *system_unbound_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_dfl_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_dfl_wq); struct workqueue_struct *system_freezable_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_wq); struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); struct workqueue_struct *system_bh_wq; EXPORT_SYMBOL_GPL(system_bh_wq); struct workqueue_struct *system_bh_highpri_wq; EXPORT_SYMBOL_GPL(system_bh_highpri_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); static void show_pwq(struct pool_workqueue *pwq); static void show_one_worker_pool(struct worker_pool *pool); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_bh_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor * @pi: integer used for iteration * * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ if (({ assert_rcu_or_pool_mutex(); false; })) { } \ else /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor * @pool: worker_pool to iterate workers of * * This must be called with wq_pool_attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ else /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor * @wq: the target workqueue * * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ lockdep_is_held(&(wq->mutex))) #ifdef CONFIG_DEBUG_OBJECTS_WORK static const struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { return ((struct work_struct *) addr)->func; } static bool work_is_static_object(void *addr) { struct work_struct *work = addr; return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); } /* * fixup_init is called when: * - an active object is initialized */ static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); return true; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); return true; default: return false; } } static const struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, .fixup_free = work_fixup_free, }; static inline void debug_work_activate(struct work_struct *work) { debug_object_activate(work, &work_debug_descr); } static inline void debug_work_deactivate(struct work_struct *work) { debug_object_deactivate(work, &work_debug_descr); } void __init_work(struct work_struct *work, int onstack) { if (onstack) debug_object_init_on_stack(work, &work_debug_descr); else debug_object_init(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(__init_work); void destroy_work_on_stack(struct work_struct *work) { debug_object_free(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { timer_destroy_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned * successfully, -errno on failure. */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; } return ret; } static struct pool_workqueue __rcu ** unbound_pwq_slot(struct workqueue_struct *wq, int cpu) { if (cpu >= 0) return per_cpu_ptr(wq->cpu_pwq, cpu); else return &wq->dfl_pwq; } /* @cpu < 0 for dfl_pwq */ static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) { return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), lockdep_is_held(&wq_pool_mutex) || lockdep_is_held(&wq->mutex)); } /** * unbound_effective_cpumask - effective cpumask of an unbound workqueue * @wq: workqueue of interest * * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which * is masked with wq_unbound_cpumask to determine the effective cpumask. The * default pwq is always mapped to the pool with the current effective cpumask. */ static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) { return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; } static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; } static int get_work_color(unsigned long work_data) { return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } static int work_next_color(int color) { return (color + 1) % WORK_NR_COLORS; } static unsigned long pool_offq_flags(struct worker_pool *pool) { return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; } /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling() * can be used to set the pwq, pool or clear work->data. These functions should * only be called while the work is owned - ie. while the PENDING bit is set. * * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. */ static inline void set_work_data(struct work_struct *work, unsigned long data) { WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | work_static(work)); } static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, unsigned long flags) { set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, int pool_id, unsigned long flags) { set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | WORK_STRUCT_PENDING | flags); } static void set_work_pool_and_clear_pending(struct work_struct *work, int pool_id, unsigned long flags) { /* * The following wmb is paired with the implied mb in * test_and_set_bit(PENDING) and ensures all updates to @work made * here are visible to and precede any updates by the next PENDING * owner. */ smp_wmb(); set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | flags); /* * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. consider this case: * * CPU#0 CPU#1 * ---------------------------- -------------------------------- * * 1 STORE event_indicated * 2 queue_work_on() { * 3 test_and_set_bit(PENDING) * 4 } set_..._and_clear_pending() { * 5 set_work_data() # clear bit * 6 smp_mb() * 7 work->current_func() { * 8 LOAD event_indicated * } * * Without an explicit full barrier speculative LOAD on line 8 can * be executed before CPU#0 does STORE on line 1. If that happens, * CPU#0 observes the PENDING bit is still set and new execution of * a @work is not queued in a hope, that CPU#1 will eventually * finish the queued @work. Meanwhile CPU#1 does not see * event_indicated is set, because speculative LOAD was executed * before actual STORE. */ smp_mb(); } static inline struct pool_workqueue *work_struct_pwq(unsigned long data) { return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data); else return NULL; } /** * get_work_pool - return the worker_pool a given work was associated with * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read * access under RCU read lock. As such, this function should be * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used * beyond the critical section, the caller is responsible for ensuring the * returned pool is and stays online. * * Return: The worker_pool @work was last associated with. %NULL if none. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); int pool_id; assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; return idr_find(&worker_pool_idr, pool_id); } static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { return (v >> shift) & ((1U << bits) - 1); } static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) { WARN_ON_ONCE(data & WORK_STRUCT_PWQ); offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS); offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, WORK_OFFQ_DISABLE_BITS); offqd->flags = data & WORK_OFFQ_FLAG_MASK; } static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | ((unsigned long)offqd->flags); } /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that * they're being called with pool->lock held. */ /* * Need to wake up a worker? Called from anything but currently * running workers. * * Note that, because unbound workers never contribute to nr_running, this * function will always return %true for unbound pools as long as the * worklist isn't empty. */ static bool need_more_worker(struct worker_pool *pool) { return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ static bool may_start_working(struct worker_pool *pool) { return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ static bool need_to_create_worker(struct worker_pool *pool) { return need_more_worker(pool) && !may_start_working(pool); } /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { pool->nr_running--; } worker->flags |= flags; } /** * worker_clr_flags - clear worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; lockdep_assert_held(&pool->lock); worker->flags &= ~flags; /* * If transitioning out of NOT_RUNNING, increment nr_running. Note * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask * of multiple flags, not a single flag. */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) pool->nr_running++; } /* Return the first idle worker. Called with pool->lock held. */ static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; return list_first_entry(&pool->idle_list, struct worker, entry); } /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state * * @worker is entering idle state. Update stats and idle timer if * necessary. * * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || WARN_ON_ONCE(!list_empty(&worker->entry) && (worker->hentry.next || worker->hentry.pprev))) return; /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* Sanity check nr_running. */ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); } /** * worker_leave_idle - leave idle state * @worker: worker which is leaving idle state * * @worker is leaving idle state. Update stats. * * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); } /** * find_worker_executing_work - find worker which is executing a work * @pool: pool of interest * @work: work to find worker for * * Find a worker which is executing @work on @pool by searching * @pool->busy_hash which is keyed by the address of @work. For a worker * to match, its current execution should match the address of @work and * its work function. This is to avoid unwanted dependency between * unrelated work executions through a work item being recycled while still * being executed. * * This is a bit tricky. A work item may be freed once its execution * starts and nothing prevents the freed area from being recycled for * another work item. If the same work item address ends up being reused * before the original execution finishes, workqueue will identify the * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * * This function checks the work item address and work function to avoid * false positives. Note that this isn't complete as one may construct a * work function which can introduce dependency onto itself through a * recycled work item. Well, if somebody wants to shoot oneself in the * foot that badly, there's only so much we can do, and if such deadlock * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL * otherwise. */ static struct worker *find_worker_executing_work(struct worker_pool *pool, struct work_struct *work) { struct worker *worker; hash_for_each_possible(pool->busy_hash, worker, hentry, (unsigned long)work) if (worker->current_work == work && worker->current_func == work->func) return worker; return NULL; } /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled * @head: target list to append @work to * @nextp: out parameter for nested worklist walking * * Schedule linked works starting from @work to @head. Work series to be * scheduled starts at @work and includes any consecutive work with * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on * @nextp. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) { struct work_struct *n; /* * Linked worklist will always end before the end of the list, * use NULL for list head. */ list_for_each_entry_safe_from(work, n, NULL, entry) { list_move_tail(&work->entry, head); if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; } /* * If we're already inside safe list traversal and have moved * multiple works to the scheduled queue, the next position * needs to be updated. */ if (nextp) *nextp = n; } /** * assign_work - assign a work item and its linked work items to a worker * @work: work to assign * @worker: worker to assign to * @nextp: out parameter for nested worklist walking * * Assign @work and its linked work items to @worker. If @work is already being * executed by another worker in the same pool, it'll be punted there. * * If @nextp is not NULL, it's updated to point to the next work of the last * scheduled work. This allows assign_work() to be nested inside * list_for_each_entry_safe(). * * Returns %true if @work was successfully assigned to @worker. %false if @work * was punted to another worker already executing it. */ static bool assign_work(struct work_struct *work, struct worker *worker, struct work_struct **nextp) { struct worker_pool *pool = worker->pool; struct worker *collision; lockdep_assert_held(&pool->lock); /* * A single work shouldn't be executed concurrently by multiple workers. * __queue_work() ensures that @work doesn't jump to a different pool * while still running in the previous pool. Here, we should ensure that * @work is not executed concurrently by multiple workers from the same * pool. Check whether anyone is already processing the work. If so, * defer the work to the currently executing one. */ collision = find_worker_executing_work(pool, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, nextp); return false; } move_linked_works(work, &worker->scheduled, nextp); return true; } static struct irq_work *bh_pool_irq_work(struct worker_pool *pool) { int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0; return &per_cpu(bh_pool_irq_works, pool->cpu)[high]; } static void kick_bh_pool(struct worker_pool *pool) { #ifdef CONFIG_SMP /* see drain_dead_softirq_workfn() for BH_DRAINING */ if (unlikely(pool->cpu != smp_processor_id() && !(pool->flags & POOL_BH_DRAINING))) { irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu); return; } #endif if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) raise_softirq_irqoff(HI_SOFTIRQ); else raise_softirq_irqoff(TASKLET_SOFTIRQ); } /** * kick_pool - wake up an idle worker if necessary * @pool: pool to kick * * @pool may have pending work items. Wake up worker if necessary. Returns * whether a worker was woken up. */ static bool kick_pool(struct worker_pool *pool) { struct worker *worker = first_idle_worker(pool); struct task_struct *p; lockdep_assert_held(&pool->lock); if (!need_more_worker(pool) || !worker) return false; if (pool->flags & POOL_BH) { kick_bh_pool(pool); return true; } p = worker->task; #ifdef CONFIG_SMP /* * Idle @worker is about to execute @work and waking up provides an * opportunity to migrate @worker at a lower cost by setting the task's * wake_cpu field. Let's see if we want to move @worker to improve * execution locality. * * We're waking the worker that went idle the latest and there's some * chance that @worker is marked idle but hasn't gone off CPU yet. If * so, setting the wake_cpu won't do anything. As this is a best-effort * optimization and the race window is narrow, let's leave as-is for * now. If this becomes pronounced, we can skip over workers which are * still on cpu when picking an idle worker. * * If @pool has non-strict affinity, @worker might have ended up outside * its affinity scope. Repatriate. */ if (!pool->attrs->affn_strict && !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask, cpu_online_mask); if (wake_cpu < nr_cpu_ids) { p->wake_cpu = wake_cpu; get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; } } #endif wake_up_process(p); return true; } #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* * Concurrency-managed per-cpu work items that hog CPU for longer than * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, * which prevents them from stalling other concurrency-managed work items. If a * work function keeps triggering this mechanism, it's likely that the work item * should be using an unbound workqueue instead. * * wq_cpu_intensive_report() tracks work functions which trigger such conditions * and report them so that they can be examined and converted to use unbound * workqueues as appropriate. To avoid flooding the console, each violating work * function is tracked and reported with exponential backoff. */ #define WCI_MAX_ENTS 128 struct wci_ent { work_func_t func; atomic64_t cnt; struct hlist_node hash_node; }; static struct wci_ent wci_ents[WCI_MAX_ENTS]; static int wci_nr_ents; static DEFINE_RAW_SPINLOCK(wci_lock); static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); static struct wci_ent *wci_find_ent(work_func_t func) { struct wci_ent *ent; hash_for_each_possible_rcu(wci_hash, ent, hash_node, (unsigned long)func) { if (ent->func == func) return ent; } return NULL; } static void wq_cpu_intensive_report(work_func_t func) { struct wci_ent *ent; restart: ent = wci_find_ent(func); if (ent) { u64 cnt; /* * Start reporting from the warning_thresh and back off * exponentially. */ cnt = atomic64_inc_return_relaxed(&ent->cnt); if (wq_cpu_intensive_warning_thresh && cnt >= wq_cpu_intensive_warning_thresh && is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh)) printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", ent->func, wq_cpu_intensive_thresh_us, atomic64_read(&ent->cnt)); return; } /* * @func is a new violation. Allocate a new entry for it. If wcn_ents[] * is exhausted, something went really wrong and we probably made enough * noise already. */ if (wci_nr_ents >= WCI_MAX_ENTS) return; raw_spin_lock(&wci_lock); if (wci_nr_ents >= WCI_MAX_ENTS) { raw_spin_unlock(&wci_lock); return; } if (wci_find_ent(func)) { raw_spin_unlock(&wci_lock); goto restart; } ent = &wci_ents[wci_nr_ents++]; ent->func = func; atomic64_set(&ent->cnt, 0); hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); raw_spin_unlock(&wci_lock); goto restart; } #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ static void wq_cpu_intensive_report(work_func_t func) {} #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ /** * wq_worker_running - a worker is running again * @task: task waking up * * This function is called when a worker returns from schedule() */ void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); if (!READ_ONCE(worker->sleeping)) return; /* * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check * and the nr_running increment below, we may ruin the nr_running reset * and leave with an unexpected pool->nr_running == 1 on the newly unbound * pool. Protect against such race. */ preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) worker->pool->nr_running++; preempt_enable(); /* * CPU intensive auto-detection cares about how long a work item hogged * CPU without sleeping. Reset the starting timestamp on wakeup. */ worker->current_at = worker->task->se.sum_exec_runtime; WRITE_ONCE(worker->sleeping, 0); } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * * This function is called from schedule() when a busy worker is * going to sleep. */ void wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task); struct worker_pool *pool; /* * Rescuers, which may not have all the fields set up like normal * workers, also reach here, let's not access anything before * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) return; pool = worker->pool; /* Return if preempted before wq_worker_running() was reached */ if (READ_ONCE(worker->sleeping)) return; WRITE_ONCE(worker->sleeping, 1); raw_spin_lock_irq(&pool->lock); /* * Recheck in case unbind_workers() preempted us. We don't * want to decrement nr_running after the worker is unbound * and nr_running has been reset. */ if (worker->flags & WORKER_NOT_RUNNING) { raw_spin_unlock_irq(&pool->lock); return; } pool->nr_running--; if (kick_pool(pool)) worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock_irq(&pool->lock); } /** * wq_worker_tick - a scheduler tick occurred while a kworker is running * @task: task currently running * * Called from sched_tick(). We're in the IRQ context and the current * worker's fields which follow the 'K' locking rule can be accessed safely. */ void wq_worker_tick(struct task_struct *task) { struct worker *worker = kthread_data(task); struct pool_workqueue *pwq = worker->current_pwq; struct worker_pool *pool = worker->pool; if (!pwq) return; pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; if (!wq_cpu_intensive_thresh_us) return; /* * If the current worker is concurrency managed and hogged the CPU for * longer than wq_cpu_intensive_thresh_us, it's automatically marked * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. * * Set @worker->sleeping means that @worker is in the process of * switching out voluntarily and won't be contributing to * @pool->nr_running until it wakes up. As wq_worker_sleeping() also * decrements ->nr_running, setting CPU_INTENSIVE here can lead to * double decrements. The task is releasing the CPU anyway. Let's skip. * We probably want to make this prettier in the future. */ if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || worker->task->se.sum_exec_runtime - worker->current_at < wq_cpu_intensive_thresh_us * NSEC_PER_USEC) return; raw_spin_lock(&pool->lock); worker_set_flags(worker, WORKER_CPU_INTENSIVE); wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; if (kick_pool(pool)) pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock(&pool->lock); } /** * wq_worker_last_func - retrieve worker's last work function * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. * * CONTEXT: * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during * dequeuing, to allow periodic aggregation to shut-off when that * worker is the last task in the system or cgroup to go to sleep. * * As this function doesn't involve any workqueue-related locking, it * only returns stable values when called from inside the scheduler's * queuing and dequeuing paths, when @task, which must be a kworker, * is guaranteed to not be processing any works. * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. */ work_func_t wq_worker_last_func(struct task_struct *task) { struct worker *worker = kthread_data(task); return worker->last_func; } /** * wq_node_nr_active - Determine wq_node_nr_active to use * @wq: workqueue of interest * @node: NUMA node, can be %NUMA_NO_NODE * * Determine wq_node_nr_active to use for @wq on @node. Returns: * * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. * * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. * * - Otherwise, node_nr_active[@node]. */ static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, int node) { if (!(wq->flags & WQ_UNBOUND)) return NULL; if (node == NUMA_NO_NODE) node = nr_node_ids; return wq->node_nr_active[node]; } /** * wq_update_node_max_active - Update per-node max_actives to use * @wq: workqueue to update * @off_cpu: CPU that's going down, -1 if a CPU is not going down * * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is * distributed among nodes according to the proportions of numbers of online * cpus. The result is always between @wq->min_active and max_active. */ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) { struct cpumask *effective = unbound_effective_cpumask(wq); int min_active = READ_ONCE(wq->min_active); int max_active = READ_ONCE(wq->max_active); int total_cpus, node; lockdep_assert_held(&wq->mutex); if (!wq_topo_initialized) return; if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective)) off_cpu = -1; total_cpus = cpumask_weight_and(effective, cpu_online_mask); if (off_cpu >= 0) total_cpus--; /* If all CPUs of the wq get offline, use the default values */ if (unlikely(!total_cpus)) { for_each_node(node) wq_node_nr_active(wq, node)->max = min_active; wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; return; } for_each_node(node) { int node_cpus; node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) node_cpus--; wq_node_nr_active(wq, node)->max = clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), min_active, max_active); } wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; } /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * * Obtain an extra reference on @pwq. The caller should guarantee that * @pwq has positive refcnt and be holding the matching pool->lock. */ static void get_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); WARN_ON_ONCE(pwq->refcnt <= 0); pwq->refcnt++; } /** * put_pwq - put a pool_workqueue reference * @pwq: pool_workqueue to put * * Drop a reference of @pwq. If its refcnt reaches zero, schedule its * destruction. The caller should be holding the matching pool->lock. */ static void put_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; /* * @pwq can't be released under pool->lock, bounce to a dedicated * kthread_worker to avoid A-A deadlocks. */ kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock * @pwq: pool_workqueue to put (can be %NULL) * * put_pwq() with locking. This function also allows %NULL @pwq. */ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); } } static bool pwq_is_empty(struct pool_workqueue *pwq) { return !pwq->nr_active && list_empty(&pwq->inactive_works); } static void __pwq_activate_work(struct pool_workqueue *pwq, struct work_struct *work) { unsigned long *wdb = work_data_bits(work); WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = READ_ONCE(nna->max); int old = atomic_read(&nna->nr); do { if (old >= max) return false; } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1)); return true; } /** * pwq_tryinc_nr_active - Try to increment nr_active for a pwq * @pwq: pool_workqueue of interest * @fill: max_active may have increased, try to increase concurrency level * * Try to increment nr_active for @pwq. Returns %true if an nr_active count is * successfully obtained. %false otherwise. */ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) { struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); bool obtained = false; lockdep_assert_held(&pool->lock); if (!nna) { /* BH or per-cpu workqueue, pwq->nr_active is sufficient */ obtained = pwq->nr_active < READ_ONCE(wq->max_active); goto out; } if (unlikely(pwq->plugged)) return false; /* * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is * already waiting on $nna, pwq_dec_nr_active() will maintain the * concurrency level. Don't jump the line. * * We need to ignore the pending test after max_active has increased as * pwq_dec_nr_active() can only maintain the concurrency level but not * increase it. This is indicated by @fill. */ if (!list_empty(&pwq->pending_node) && likely(!fill)) goto out; obtained = tryinc_node_nr_active(nna); if (obtained) goto out; /* * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs * and try again. The smp_mb() is paired with the implied memory barrier * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either * we see the decremented $nna->nr or they see non-empty * $nna->pending_pwqs. */ raw_spin_lock(&nna->lock); if (list_empty(&pwq->pending_node)) list_add_tail(&pwq->pending_node, &nna->pending_pwqs); else if (likely(!fill)) goto out_unlock; smp_mb(); obtained = tryinc_node_nr_active(nna); /* * If @fill, @pwq might have already been pending. Being spuriously * pending in cold paths doesn't affect anything. Let's leave it be. */ if (obtained && likely(!fill)) list_del_init(&pwq->pending_node); out_unlock: raw_spin_unlock(&nna->lock); out: if (obtained) pwq->nr_active++; return obtained; } /** * pwq_activate_first_inactive - Activate the first inactive work item on a pwq * @pwq: pool_workqueue of interest * @fill: max_active may have increased, try to increase concurrency level * * Activate the first inactive work item of @pwq if available and allowed by * max_active limit. * * Returns %true if an inactive work item has been activated. %false if no * inactive work item is found or max_active limit is reached. */ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) { struct work_struct *work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); if (work && pwq_tryinc_nr_active(pwq, fill)) { __pwq_activate_work(pwq, work); return true; } else { return false; } } /** * unplug_oldest_pwq - unplug the oldest pool_workqueue * @wq: workqueue_struct where its oldest pwq is to be unplugged * * This function should only be called for ordered workqueues where only the * oldest pwq is unplugged, the others are plugged to suspend execution to * ensure proper work item ordering:: * * dfl_pwq --------------+ [P] - plugged * | * v * pwqs -> A -> B [P] -> C [P] (newest) * | | | * 1 3 5 * | | | * 2 4 6 * * When the oldest pwq is drained and removed, this function should be called * to unplug the next oldest one to start its work item execution. Note that * pwq's are linked into wq->pwqs with the oldest first, so the first one in * the list is the oldest. */ static void unplug_oldest_pwq(struct workqueue_struct *wq) { struct pool_workqueue *pwq; lockdep_assert_held(&wq->mutex); /* Caller should make sure that pwqs isn't empty before calling */ pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue, pwqs_node); raw_spin_lock_irq(&pwq->pool->lock); if (pwq->plugged) { pwq->plugged = false; if (pwq_activate_first_inactive(pwq, true)) kick_pool(pwq->pool); } raw_spin_unlock_irq(&pwq->pool->lock); } /** * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active * @nna: wq_node_nr_active to activate a pending pwq for * @caller_pool: worker_pool the caller is locking * * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. * @caller_pool may be unlocked and relocked to lock other worker_pools. */ static void node_activate_pending_pwq(struct wq_node_nr_active *nna, struct worker_pool *caller_pool) { struct worker_pool *locked_pool = caller_pool; struct pool_workqueue *pwq; struct work_struct *work; lockdep_assert_held(&caller_pool->lock); raw_spin_lock(&nna->lock); retry: pwq = list_first_entry_or_null(&nna->pending_pwqs, struct pool_workqueue, pending_node); if (!pwq) goto out_unlock; /* * If @pwq is for a different pool than @locked_pool, we need to lock * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock * / lock dance. For that, we also need to release @nna->lock as it's * nested inside pool locks. */ if (pwq->pool != locked_pool) { raw_spin_unlock(&locked_pool->lock); locked_pool = pwq->pool; if (!raw_spin_trylock(&locked_pool->lock)) { raw_spin_unlock(&nna->lock); raw_spin_lock(&locked_pool->lock); raw_spin_lock(&nna->lock); goto retry; } } /* * $pwq may not have any inactive work items due to e.g. cancellations. * Drop it from pending_pwqs and see if there's another one. */ work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); if (!work) { list_del_init(&pwq->pending_node); goto retry; } /* * Acquire an nr_active count and activate the inactive work item. If * $pwq still has inactive work items, rotate it to the end of the * pending_pwqs so that we round-robin through them. This means that * inactive work items are not activated in queueing order which is fine * given that there has never been any ordering across different pwqs. */ if (likely(tryinc_node_nr_active(nna))) { pwq->nr_active++; __pwq_activate_work(pwq, work); if (list_empty(&pwq->inactive_works)) list_del_init(&pwq->pending_node); else list_move_tail(&pwq->pending_node, &nna->pending_pwqs); /* if activating a foreign pool, make sure it's running */ if (pwq->pool != caller_pool) kick_pool(pwq->pool); } out_unlock: raw_spin_unlock(&nna->lock); if (locked_pool != caller_pool) { raw_spin_unlock(&locked_pool->lock); raw_spin_lock(&caller_pool->lock); } } /** * pwq_dec_nr_active - Retire an active count * @pwq: pool_workqueue of interest * * Decrement @pwq's nr_active and try to activate the first inactive work item. * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. */ static void pwq_dec_nr_active(struct pool_workqueue *pwq) { struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); lockdep_assert_held(&pool->lock); /* * @pwq->nr_active should be decremented for both percpu and unbound * workqueues. */ pwq->nr_active--; /* * For a percpu workqueue, it's simple. Just need to kick the first * inactive work item on @pwq itself. */ if (!nna) { pwq_activate_first_inactive(pwq, false); return; } /* * If @pwq is for an unbound workqueue, it's more complicated because * multiple pwqs and pools may be sharing the nr_active count. When a * pwq needs to wait for an nr_active count, it puts itself on * $nna->pending_pwqs. The following atomic_dec_return()'s implied * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to * guarantee that either we see non-empty pending_pwqs or they see * decremented $nna->nr. * * $nna->max may change as CPUs come online/offline and @pwq->wq's * max_active gets updated. However, it is guaranteed to be equal to or * larger than @pwq->wq->min_active which is above zero unless freezing. * This maintains the forward progress guarantee. */ if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) return; if (!list_empty(&nna->pending_pwqs)) node_activate_pending_pwq(nna, pool); } /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * * NOTE: * For unbound workqueues, this function may temporarily drop @pwq->pool->lock * and thus should be called after all other state updates for the in-flight * work item is complete. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { int color = get_work_color(work_data); if (!(work_data & WORK_STRUCT_INACTIVE)) pwq_dec_nr_active(pwq); pwq->nr_in_flight[color]--; /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; /* * If this was the last pwq, wake up the first flusher. It * will handle the rest. */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); out_put: put_pwq(pwq); } /** * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @cflags: %WORK_CANCEL_ flags * @irq_flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any * stable state - idle, on timer or on worklist. * * Return: * * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting * interrupted while holding PENDING and @work off queue, irq must be * disabled on entry. This, combined with delayed_work->timer being * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is * responsible for releasing it using local_irq_restore(*@irq_flags). * * This function is safe to call from any context including IRQ handler. */ static int try_to_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { struct worker_pool *pool; struct pool_workqueue *pwq; local_irq_save(*irq_flags); /* try to steal the timer if it exists */ if (cflags & WORK_CANCEL_DELAYED) { struct delayed_work *dwork = to_delayed_work(work); /* * dwork->timer is irqsafe. If timer_delete() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ if (likely(timer_delete(&dwork->timer))) return 1; } /* try to claim PENDING the normal way */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ pool = get_work_pool(work); if (!pool) goto fail; raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point * to pwq on queueing and to pool on dequeueing are done under * pwq->pool->lock. This in turn guarantees that, if work->data * points to pwq which is associated with a locked pool, the work * item is currently queued on that pool. */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { unsigned long work_data = *work_data_bits(work); debug_work_deactivate(work); /* * A cancelable inactive work item must be in the * pwq->inactive_works since a queued barrier can't be * canceled (see the comments in insert_wq_barrier()). * * An inactive work item cannot be deleted directly because * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Move the linked * barrier work items to the worklist when deleting the grabbed * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that * it doesn't participate in nr_active management in later * pwq_dec_nr_in_flight(). */ if (work_data & WORK_STRUCT_INACTIVE) move_linked_works(work, &pwq->pool->worklist, NULL); list_del_init(&work->entry); /* * work->data points to pwq iff queued. Let's point to pool. As * this destroys work->data needed by the next step, stash it. */ set_work_pool_and_keep_pending(work, pool->id, pool_offq_flags(pool)); /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*irq_flags); return -EAGAIN; } /** * work_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @cflags: %WORK_CANCEL_ flags * @irq_flags: place to store IRQ state * * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer * or on worklist. * * Can be called from any context. IRQ is disabled on return with IRQ state * stored in *@irq_flags. The caller is responsible for re-enabling it using * local_irq_restore(). * * Returns %true if @work was pending. %false if idle. */ static bool work_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { int ret; while (true) { ret = try_to_grab_pending(work, cflags, irq_flags); if (ret >= 0) return ret; cpu_relax(); } } /** * insert_work - insert a work into a pool * @pwq: pwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to * work_struct flags. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); } /* * Test whether @work is being queued from another work executing on the * same workqueue. */ static bool is_chained_work(struct workqueue_struct *wq) { struct worker *worker; worker = current_wq_worker(); /* * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ return worker && worker->current_pwq->wq == wq; } /* * When queueing an unbound work item to a wq, prefer local CPU if allowed * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to * avoid perturbing sensitive tasks. */ static int wq_select_unbound_cpu(int cpu) { int new_cpu; if (likely(!wq_debug_force_rr_cpu)) { if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) return cpu; } else { pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) return cpu; __this_cpu_write(wq_rr_cpu_last, new_cpu); return new_cpu; } static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; /* * While a work item is PENDING && off queue, a task trying to * steal the PENDING will busy-loop waiting for it to either get * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ lockdep_assert_irqs_disabled(); /* * For a draining wq, only works from the same workqueue are * allowed. The __WQ_DESTROYING helps to spot the issue that * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", work->func, wq->name))) { return; } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); pool = pwq->pool; /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. * * For ordered workqueue, work items must be queued on the newest pwq * for accurate order management. Guaranteed order also guarantees * non-reentrancy. See the comments above unplug_oldest_pwq(). */ last_pool = get_work_pool(work); if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { struct worker *worker; raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; pool = pwq->pool; WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); raw_spin_lock(&pool->lock); } } else { raw_spin_lock(&pool->lock); } /* * pwq is determined and locked. For unbound pools, we could have raced * with pwq release and it could already be dead. If its refcnt is zero, * repeat pwq selection. Note that unbound pwqs never die without * another pwq replacing it in cpu_pwq or while work items are executing * on it, so the retrying is guaranteed to make forward-progress. */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } /* oops */ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", wq->name, cpu); } /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); if (WARN_ON(!list_empty(&work->entry))) goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); /* * Limit the number of concurrently active work items to max_active. * @work must also queue behind existing inactive work items to maintain * ordering when max_active changes. See wq_adjust_max_active(). */ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; trace_workqueue_activate_work(work); insert_work(pwq, work, &pool->worklist, work_flags); kick_pool(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; insert_work(pwq, work, &pwq->inactive_works, work_flags); } out: raw_spin_unlock(&pool->lock); rcu_read_unlock(); } static bool clear_pending_if_disabled(struct work_struct *work) { unsigned long data = *work_data_bits(work); struct work_offq_data offqd; if (likely((data & WORK_STRUCT_PWQ) || !(data & WORK_OFFQ_DISABLE_MASK))) return false; work_offqd_unpack(&offqd, data); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); return true; } /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it * can't go away. Callers that fail to ensure that the specified * CPU cannot go away will execute on a randomly chosen CPU. * But note well that callers specifying a CPU that never has been * online will get a splat. * * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { bool ret = false; unsigned long irq_flags; local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { __queue_work(cpu, wq, work); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_work_on); /** * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given * node. If there are no CPUs available on the given node it will return * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. */ static int select_numa_node_cpu(int node) { int cpu; /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; /* Use local node/cpu if we are already there */ cpu = raw_smp_processor_id(); if (node == cpu_to_node(cpu)) return cpu; /* Use "random" otherwise know as "first" online CPU of node */ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); /* If CPU is valid return that, otherwise just defer */ return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; } /** * queue_work_node - queue work on a "random" cpu for a given NUMA node * @node: NUMA node that we are targeting the work for * @wq: workqueue to use * @work: work to queue * * We queue the work to a "random" CPU within a given NUMA node. The basic * idea here is to provide a way to somehow associate work with a given * NUMA node. * * This function will only make a best effort attempt at getting this onto * the right NUMA node. If no node is requested or the requested node is * offline then we just fall back to standard queue_work behavior. * * Currently the "random" CPU ends up being the first available CPU in the * intersection of cpu_online_mask and the cpumask of the node, unless we * are running on the node. In that case we just use the current CPU. * * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work) { unsigned long irq_flags; bool ret = false; /* * This current implementation is specific to unbound workqueues. * Specifically we only return the first available CPU for a given * node instead of cycling through individual CPUs within the node. * * If this is used with a per-cpu workqueue then the logic in * workqueue_select_cpu_near would need to be updated to allow for * some round robin type logic. */ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_node); void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = timer_container_of(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); } EXPORT_SYMBOL(delayed_work_timer_fn); static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { __queue_work(cpu, wq, &dwork->work); return; } WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu)); dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; if (housekeeping_enabled(HK_TYPE_TIMER)) { /* If the current cpu is a housekeeping cpu, use it. */ cpu = smp_processor_id(); if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER)) cpu = housekeeping_any_cpu(HK_TYPE_TIMER); add_timer_on(timer, cpu); } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } } /** * queue_delayed_work_on - queue work on specific CPU after delay * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * We queue the delayed_work to a specific CPU, for non-zero delays the * caller must ensure it is online and can't go away. Callers that fail * to ensure this, may get @dwork->timer queued to an offlined CPU and * this will prevent queueing of @dwork->work unless the offlined CPU * becomes online again. * * Return: %false if @work was already on a queue, %true otherwise. If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. */ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct work_struct *work = &dwork->work; bool ret = false; unsigned long irq_flags; /* read the comment in __queue_work() */ local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is * zero, @work is guaranteed to be scheduled immediately regardless of its * current state. * * Return: %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * * This function is safe to call from any context including IRQ handler. * See try_to_grab_pending() for details. */ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long irq_flags; bool ret; ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); if (!clear_pending_if_disabled(&dwork->work)) __queue_delayed_work(cpu, wq, dwork, delay); local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); static void rcu_work_rcufn(struct rcu_head *rcu) { struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); /* read the comment in __queue_work() */ local_irq_disable(); __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); local_irq_enable(); } /** * queue_rcu_work - queue work after a RCU grace period * @wq: workqueue to use * @rwork: work to queue * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. */ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; /* * rcu_work can't be canceled or disabled. Warn if the user reached * inside @rwork and disabled the inner work. */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !WARN_ON_ONCE(clear_pending_if_disabled(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; } return false; } EXPORT_SYMBOL(queue_rcu_work); static struct worker *alloc_worker(int node) { struct worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } return worker; } static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) { if (pool->cpu < 0 && pool->attrs->affn_strict) return pool->attrs->__pod_cpumask; else return pool->attrs->cpumask; } /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached * @pool: the target pool * * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and * cpu-binding of @worker are kept coordinated with the pool across * cpu-[un]hotplugs. */ static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { mutex_lock(&wq_pool_attach_mutex); /* * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable * across this function. See the comments above the flag definition for * details. BH workers are, while per-CPU, always DISASSOCIATED. */ if (pool->flags & POOL_DISASSOCIATED) { worker->flags |= WORKER_UNBOUND; } else { WARN_ON_ONCE(pool->flags & POOL_BH); kthread_set_per_cpu(worker->task, pool->cpu); } if (worker->rescue_wq) set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; mutex_unlock(&wq_pool_attach_mutex); } static void unbind_worker(struct worker *worker) { lockdep_assert_held(&wq_pool_attach_mutex); kthread_set_per_cpu(worker->task, -1); if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); else WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); } static void detach_worker(struct worker *worker) { lockdep_assert_held(&wq_pool_attach_mutex); unbind_worker(worker); list_del(&worker->node); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ static void worker_detach_from_pool(struct worker *worker) { struct worker_pool *pool = worker->pool; /* there is one permanent BH worker per CPU which should never detach */ WARN_ON_ONCE(pool->flags & POOL_BH); mutex_lock(&wq_pool_attach_mutex); detach_worker(worker); worker->pool = NULL; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); } static int format_worker_id(char *buf, size_t size, struct worker *worker, struct worker_pool *pool) { if (worker->rescue_wq) return scnprintf(buf, size, "kworker/R-%s", worker->rescue_wq->name); if (pool) { if (pool->cpu >= 0) return scnprintf(buf, size, "kworker/%d:%d%s", pool->cpu, worker->id, pool->attrs->nice < 0 ? "H" : ""); else return scnprintf(buf, size, "kworker/u%d:%d", pool->id, worker->id); } else { return scnprintf(buf, size, "kworker/dying"); } } /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. * * Return: * Pointer to the newly created worker. */ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) { pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", ERR_PTR(id)); return NULL; } worker = alloc_worker(pool->node); if (!worker) { pr_err_once("workqueue: Failed to allocate a worker\n"); goto fail; } worker->id = id; if (!(pool->flags & POOL_BH)) { char id_buf[WORKER_ID_LEN]; format_worker_id(id_buf, sizeof(id_buf), worker, pool); worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "%s", id_buf); if (IS_ERR(worker->task)) { if (PTR_ERR(worker->task) == -EINTR) { pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", id_buf); } else { pr_err_once("workqueue: Failed to create a worker thread: %pe", worker->task); } goto fail; } set_user_nice(worker->task, pool->attrs->nice); kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); } /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); /* * @worker is waiting on a completion in kthread() and will trigger hung * check if not woken up soon. As kick_pool() is noop if @pool is empty, * wake it up explicitly. */ if (worker->task) wake_up_process(worker->task); raw_spin_unlock_irq(&pool->lock); return worker; fail: ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } static void detach_dying_workers(struct list_head *cull_list) { struct worker *worker; list_for_each_entry(worker, cull_list, entry) detach_worker(worker); } static void reap_dying_workers(struct list_head *cull_list) { struct worker *worker, *tmp; list_for_each_entry_safe(worker, tmp, cull_list, entry) { list_del_init(&worker->entry); kthread_stop_put(worker->task); kfree(worker); } } /** * set_worker_dying - Tag a worker for destruction * @worker: worker to be destroyed * @list: transfer worker away from its pool->idle_list and into list * * Tag @worker for destruction and adjust @pool stats accordingly. The worker * should be idle. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void set_worker_dying(struct worker *worker, struct list_head *list) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); lockdep_assert_held(&wq_pool_attach_mutex); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || WARN_ON(!list_empty(&worker->scheduled)) || WARN_ON(!(worker->flags & WORKER_IDLE))) return; pool->nr_workers--; pool->nr_idle--; worker->flags |= WORKER_DIE; list_move(&worker->entry, list); /* get an extra task struct reference for later kthread_stop_put() */ get_task_struct(worker->task); } /** * idle_worker_timeout - check if some idle workers can now be deleted. * @t: The pool's idle_timer that just expired * * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in * worker_leave_idle(), as a worker flicking between idle and active while its * pool is at the too_many_workers() tipping point would cause too much timer * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let * it expire and re-evaluate things from there. */ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = timer_container_of(pool, t, idle_timer); bool do_cull = false; if (work_pending(&pool->idle_cull_work)) return; raw_spin_lock_irq(&pool->lock); if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); if (!do_cull) mod_timer(&pool->idle_timer, expires); } raw_spin_unlock_irq(&pool->lock); if (do_cull) queue_work(system_unbound_wq, &pool->idle_cull_work); } /** * idle_cull_fn - cull workers that have been idle for too long. * @work: the pool's work for handling these idle workers * * This goes through a pool's idle workers and gets rid of those that have been * idle for at least IDLE_WORKER_TIMEOUT seconds. * * We don't want to disturb isolated CPUs because of a pcpu kworker being * culled, so this also resets worker affinity. This requires a sleepable * context, hence the split between timer callback and work item. */ static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); LIST_HEAD(cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong set_pf_worker() in its self-destruct path. * This is required as a previously-preempted worker could run after * set_worker_dying() has happened but before detach_dying_workers() did. */ mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); break; } set_worker_dying(worker, &cull_list); } raw_spin_unlock_irq(&pool->lock); detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); reap_dying_workers(&cull_list); } static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq_mayday_lock); if (!wq->rescuer) return; /* mayday mayday mayday */ if (list_empty(&pwq->mayday_node)) { /* * If @pwq is for an unbound wq, its base ref may be put at * any time due to an attribute change. Pin @pwq until the * rescuer is done with it. */ get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); pwq->stats[PWQ_STAT_MAYDAY]++; } } static void pool_mayday_timeout(struct timer_list *t) { struct worker_pool *pool = timer_container_of(pool, t, mayday_timer); struct work_struct *work; raw_spin_lock_irq(&pool->lock); raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } raw_spin_unlock(&wq_mayday_lock); raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary * @pool: pool to create a new worker for * * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be %false and * may_start_working() %true. * * LOCKING: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { restart: raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { if (create_worker(pool) || !need_to_create_worker(pool)) break; schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; } timer_delete_sync(&pool->mayday_timer); raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have * already become busy. */ if (need_to_create_worker(pool)) goto restart; } /** * manage_workers - manage worker pool * @worker: self * * Assume the manager role and manage the worker pool @worker belongs * to. At any given time, there can be only zero or one manager per * pool. The exclusion is handled automatically by this function. * * The caller can safely start processing works on false return. On * true return, it's guaranteed that need_to_create_worker() is false * and may_start_working() is true. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: * %false if the pool doesn't need management and the caller can safely * start processing works, %true if management function was performed and * the conditions that the caller verified before calling the function may * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & POOL_MANAGER_ACTIVE) return false; pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; rcuwait_wake_up(&manager_wait); return true; } /** * process_one_work - process single work * @worker: self * @work: work to process * * Process @work. This function contains all the logics necessary to * process a single work including synchronization against and * interaction with other workers on the same cpu, queueing and * flushing. As long as context requirement is met, any worker can * call this function to process a work. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; int lockdep_start_depth, rcu_start_depth; bool bh_draining = pool->flags & POOL_BH_DRAINING; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from * inside the function that is called from it, this we need to * take into account for lockdep too. To avoid bogus "held * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* ensure we're on the correct CPU */ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; if (worker->task) worker->current_at = worker->task->se.sum_exec_runtime; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get * overridden through set_worker_desc(). */ strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); list_del_init(&work->entry); /* * CPU intensive works don't participate in concurrency management. * They're the scheduler's responsibility. This takes @worker out * of concurrency management and the next code block will chain * execution of the pending work items. */ if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* * Kick @pool if necessary. It's always noop for per-cpu worker pools * since nr_running would always be >= 1 at this point. This is used to * chain execution of the pending work items for WORKER_NOT_RUNNING * workers such as the UNBOUND and CPU_INTENSIVE ones. */ kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last * update to @work. Also, do this inside @pool->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. */ set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); rcu_start_depth = rcu_preempt_depth(); lockdep_start_depth = lockdep_depth(current); /* see drain_dead_softirq_workfn() */ if (!bh_draining) lock_map_acquire(pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding * any locks, that is, before these two lock_map_acquire()'s. * * However, that would result in: * * A(W1) * WFC(C) * A(W1) * C(C) * * Which would create W1->C->W1 dependencies, even though there is no * actual deadlock possible. There are two solutions, using a * read-recursive acquire on the work(queue) 'locks', but this will then * hit the lockdep limitation on recursive locks, or simply discard * these locks. * * AFAICT there is no possible deadlock scenario between the * flush_work() and complete() primitives (except for single-threaded * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* * While we must be careful to not use "work" after this, the trace * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); lock_map_release(&lockdep_map); if (!bh_draining) lock_map_release(pwq->wq->lockdep_map); if (unlikely((worker->task && in_atomic()) || lockdep_depth(current) != lockdep_start_depth || rcu_preempt_depth() != rcu_start_depth)) { pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n" " preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n", current->comm, task_pid_nr(current), preempt_count(), lockdep_start_depth, lockdep_depth(current), rcu_start_depth, rcu_preempt_depth(), worker->current_func); debug_show_held_locks(current); dump_stack(); } /* * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ if (worker->task) cond_resched(); raw_spin_lock_irq(&pool->lock); pwq->stats[PWQ_STAT_COMPLETED]++; /* * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked * CPU intensive by wq_worker_tick() if @work hogged CPU longer than * wq_cpu_intensive_thresh_us. Clear it. */ worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* tag the worker for identification in schedule() */ worker->last_func = worker->current_func; /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; worker->current_color = INT_MAX; /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); } /** * process_scheduled_works - process scheduled works * @worker: self * * Process all scheduled works. Please note that the scheduled list * may change while processing a work, so this function repeatedly * fetches a work from the top and executes it. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) { struct work_struct *work; bool first = true; while ((work = list_first_entry_or_null(&worker->scheduled, struct work_struct, entry))) { if (first) { worker->pool->watchdog_ts = jiffies; first = false; } process_one_work(worker, work); } } static void set_pf_worker(bool val) { mutex_lock(&wq_pool_attach_mutex); if (val) current->flags |= PF_WQ_WORKER; else current->flags &= ~PF_WQ_WORKER; mutex_unlock(&wq_pool_attach_mutex); } /** * worker_thread - the worker thread function * @__worker: self * * The worker thread function. All workers belong to a worker_pool - * either a per-cpu one or dynamic unbound one. These workers process all * work items regardless of their specific target workqueue. The only * exception is work items which belong to workqueues with a rescuer which * will be explained in rescuer_thread(). * * Return: 0 */ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); /* * The worker is dead and PF_WQ_WORKER is cleared, worker->pool * shouldn't be accessed, reset it to NULL in case otherwise. */ worker->pool = NULL; ida_free(&pool->worker_ida, worker->id); return 0; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* * ->scheduled list can only be filled while a worker is * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* * Finish PREP stage. We're guaranteed to have at least one idle * worker or that someone else has already assumed the manager * role. This is where @worker starts participating in concurrency * management if applicable and concurrency management is restored * after being rebound. See rebind_workers() for details. */ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (assign_work(work, worker, NULL)) process_scheduled_works(worker); } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding * pool->lock or from local cpu, so setting the current state * before releasing pool->lock is enough to prevent losing any * event. */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } /** * rescuer_thread - the rescuer thread function * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of * developing into deadlock if some works currently on the same queue * need to be processed to satisfy the GFP_KERNEL allocation. This is * the problem rescuer solves. * * When such condition is possible, the pool summons rescuers of all * workqueues which have works queued on the pool and let them process * those works so that forward progress can be guaranteed. * * This should happen rarely. * * Return: 0 */ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); /* * Mark rescuer as worker too. As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ set_pf_worker(true); repeat: set_current_state(TASK_IDLE); /* * By the time the rescuer is requested to stop, the workqueue * shouldn't have any work pending, but @wq->maydays may still have * pwq(s) queued. This can happen by non-rescuer workers consuming * all the work items before the rescuer got to them. Go through * @wq->maydays processing before acting on should_stop so that the * list is always empty on exit. */ should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and * process'em. */ WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) pwq->stats[PWQ_STAT_RESCUED]++; } if (!list_empty(&rescuer->scheduled)) { process_scheduled_works(rescuer); /* * The above execution of rescued work items could * have created more to rescue through * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. */ if (wq->rescuer && list_empty(&pwq->mayday_node)) { get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } raw_spin_unlock(&wq_mayday_lock); } } /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); /* * Put the reference grabbed by send_mayday(). @pool might * go away any time after it. */ put_pwq_unlocked(pwq); raw_spin_lock_irq(&wq_mayday_lock); } raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); set_pf_worker(false); return 0; } /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); goto repeat; } static void bh_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; int nr_restarts = BH_WORKER_RESTARTS; unsigned long end = jiffies + BH_WORKER_JIFFIES; raw_spin_lock_irq(&pool->lock); worker_leave_idle(worker); /* * This function follows the structure of worker_thread(). See there for * explanations on each step. */ if (!need_more_worker(pool)) goto done; WARN_ON_ONCE(!list_empty(&worker->scheduled)); worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (assign_work(work, worker, NULL)) process_scheduled_works(worker); } while (keep_working(pool) && --nr_restarts && time_before(jiffies, end)); worker_set_flags(worker, WORKER_PREP); done: worker_enter_idle(worker); kick_pool(pool); raw_spin_unlock_irq(&pool->lock); } /* * TODO: Convert all tasklet users to workqueue and use softirq directly. * * This is currently called from tasklet[_hi]action() and thus is also called * whenever there are tasklets to run. Let's do an early exit if there's nothing * queued. Once conversion from tasklet is complete, the need_more_worker() test * can be dropped. * * After full conversion, we'll add worker->softirq_action, directly use the * softirq action and obtain the worker pointer from the softirq_action pointer. */ void workqueue_softirq_action(bool highpri) { struct worker_pool *pool = &per_cpu(bh_worker_pools, smp_processor_id())[highpri]; if (need_more_worker(pool)) bh_worker(list_first_entry(&pool->workers, struct worker, node)); } struct wq_drain_dead_softirq_work { struct work_struct work; struct worker_pool *pool; struct completion done; }; static void drain_dead_softirq_workfn(struct work_struct *work) { struct wq_drain_dead_softirq_work *dead_work = container_of(work, struct wq_drain_dead_softirq_work, work); struct worker_pool *pool = dead_work->pool; bool repeat; /* * @pool's CPU is dead and we want to execute its still pending work * items from this BH work item which is running on a different CPU. As * its CPU is dead, @pool can't be kicked and, as work execution path * will be nested, a lockdep annotation needs to be suppressed. Mark * @pool with %POOL_BH_DRAINING for the special treatments. */ raw_spin_lock_irq(&pool->lock); pool->flags |= POOL_BH_DRAINING; raw_spin_unlock_irq(&pool->lock); bh_worker(list_first_entry(&pool->workers, struct worker, node)); raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_BH_DRAINING; repeat = need_more_worker(pool); raw_spin_unlock_irq(&pool->lock); /* * bh_worker() might hit consecutive execution limit and bail. If there * still are pending work items, reschedule self and return so that we * don't hog this CPU's BH. */ if (repeat) { if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) queue_work(system_bh_highpri_wq, work); else queue_work(system_bh_wq, work); } else { complete(&dead_work->done); } } /* * @cpu is dead. Drain the remaining BH work items on the current CPU. It's * possible to allocate dead_work per CPU and avoid flushing. However, then we * have to worry about draining overlapping with CPU coming back online or * nesting (one CPU's dead_work queued on another CPU which is also dead and so * on). Let's keep it simple and drain them synchronously. These are BH work * items which shouldn't be requeued on the same pool. Shouldn't take long. */ void workqueue_softirq_dead(unsigned int cpu) { int i; for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i]; struct wq_drain_dead_softirq_work dead_work; if (!need_more_worker(pool)) continue; INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); dead_work.pool = pool; init_completion(&dead_work.done); if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) queue_work(system_bh_highpri_wq, &dead_work.work); else queue_work(system_bh_wq, &dead_work.work); wait_for_completion(&dead_work.done); destroy_work_on_stack(&dead_work.work); } } /** * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) * @from_cancel: are we called from the work cancel path * * %current is trying to flush the whole @target_wq or @target_work on it. * If this is not the cancel path (which implies work being flushed is either * already running, or will not be at all), check if @target_wq doesn't have * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- * progress guarantee leading to a deadlock. */ static void check_flush_dependency(struct workqueue_struct *target_wq, struct work_struct *target_work, bool from_cancel) { work_func_t target_func; struct worker *worker; if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); target_func = target_work ? target_work->func : NULL; WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } struct wq_barrier { struct work_struct work; struct completion done; struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) { struct wq_barrier *barr = container_of(work, struct wq_barrier, work); complete(&barr->done); } /** * insert_wq_barrier - insert a barrier work * @pwq: pwq to insert barrier into * @barr: wq_barrier to insert * @target: target work to attach @barr to * @worker: worker currently executing @target, NULL if @target is not executing * * @barr is linked to @target such that @barr is completed only after * @target finishes execution. Please note that the ordering * guarantee is observed only with respect to @target and on the local * cpu. * * Currently, a queued barrier can't be canceled. This is because * try_to_grab_pending() can't determine whether the work to be * grabbed is at the head of the queue and thus can't clear LINKED * flag of the previous work while there must be a valid next work * after a work with LINKED flag set. * * Note that when @worker is non-NULL, @target may be modified * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { static __maybe_unused struct lock_class_key bh_key, thr_key; unsigned int work_flags = 0; unsigned int work_color; struct list_head *head; /* * debugobject calls are safe here even with pool->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. * * BH and threaded workqueues need separate lockdep keys to avoid * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} * usage". */ INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func, (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion_map(&barr->done, &target->lockdep_map); barr->task = current; /* The barrier work item does not participate in nr_active. */ work_flags |= WORK_STRUCT_INACTIVE; /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ if (worker) { head = worker->scheduled.next; work_color = worker->current_color; } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ work_flags |= *bits & WORK_STRUCT_LINKED; work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); insert_work(pwq, &barr->work, head, work_flags); } /** * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing * @wq: workqueue being flushed * @flush_color: new flush color, < 0 for no-op * @work_color: new work color, < 0 for no-op * * Prepare pwqs for workqueue flushing. * * If @flush_color is non-negative, flush_color on all pwqs should be * -1. If no pwq has in-flight commands at the specified color, all * pwq->flush_color's stay at -1 and %false is returned. If any pwq * has in flight commands, its pwq->flush_color is set to * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq * wakeup logic is armed and %true is returned. * * The caller should have initialized @wq->first_flusher prior to * calling this function with non-negative @flush_color. If * @flush_color is negative, no flush color update is done and %false * is returned. * * If @work_color is non-negative, all pwqs should have the same * work_color which is previous to @work_color and all will be * advanced to @work_color. * * CONTEXT: * mutex_lock(wq->mutex). * * Return: * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; struct pool_workqueue *pwq; struct worker_pool *current_pool = NULL; if (flush_color >= 0) { WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } /* * For unbound workqueue, pwqs will map to only a few pools. * Most of the time, pwqs within the same pool will be linked * sequentially to wq->pwqs by cpu index. So in the majority * of pwq iters, the pool is the same, only doing lock/unlock * if the pool has changed. This can largely reduce expensive * lock operations. */ for_each_pwq(pwq, wq) { if (current_pool != pwq->pool) { if (likely(current_pool)) raw_spin_unlock_irq(¤t_pool->lock); current_pool = pwq->pool; raw_spin_lock_irq(¤t_pool->lock); } if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; atomic_inc(&wq->nr_pwqs_to_flush); wait = true; } } if (work_color >= 0) { WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } } if (current_pool) raw_spin_unlock_irq(¤t_pool->lock); if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); return wait; } static void touch_wq_lockdep_map(struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP if (unlikely(!wq->lockdep_map)) return; if (wq->flags & WQ_BH) local_bh_disable(); lock_map_acquire(wq->lockdep_map); lock_map_release(wq->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); #endif } static void touch_work_lockdep_map(struct work_struct *work, struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP if (wq->flags & WQ_BH) local_bh_disable(); lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); #endif } /** * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)), }; int next_color; if (WARN_ON(!wq_online)) return; touch_wq_lockdep_map(wq); mutex_lock(&wq->mutex); /* * Start-to-wait phase */ next_color = work_next_color(wq->work_color); if (next_color != wq->flush_color) { /* * Color space is not full. The current work_color * becomes our flush_color and work_color is advanced * by one. */ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color)) { /* nothing to flush, done */ wq->flush_color = next_color; wq->first_flusher = NULL; goto out_unlock; } } else { /* wait in queue */ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } } else { /* * Oops, color space is full, wait on overflow queue. * The next flush completion will assign us * flush_color and transfer to flusher_queue. */ list_add_tail(&this_flusher.list, &wq->flusher_overflow); } check_flush_dependency(wq, NULL, false); mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); /* * Wake-up-and-cascade phase * * First flushers are responsible for cascading flushes and * handling overflow. Non-first flushers can simply return. */ if (READ_ONCE(wq->first_flusher) != &this_flusher) return; mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) goto out_unlock; WRITE_ONCE(wq->first_flusher, NULL); WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; /* complete all the flushers sharing the current flush color */ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { if (next->flush_color != wq->flush_color) break; list_del_init(&next->list); complete(&next->done); } WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); /* one color has been freed, handle overflow queue */ if (!list_empty(&wq->flusher_overflow)) { /* * Assign the same color to all overflowed * flushers, advance work_color and append to * flusher_queue. This is the start-to-wait * phase for these overflowed flushers. */ list_for_each_entry(tmp, &wq->flusher_overflow, list) tmp->flush_color = wq->work_color; wq->work_color = work_next_color(wq->work_color); list_splice_tail_init(&wq->flusher_overflow, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } if (list_empty(&wq->flusher_queue)) { WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } /* * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ WARN_ON_ONCE(wq->flush_color == wq->work_color); WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) break; /* * Meh... this color is already done, clear first * flusher and repeat cascading. */ wq->first_flusher = NULL; } out_unlock: mutex_unlock(&wq->mutex); } EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue * @wq: workqueue to drain * * Wait until the workqueue becomes empty. While draining is in progress, * only chain queueing is allowed. IOW, only currently pending or running * work items on @wq can queue further work items on it. @wq is flushed * repeatedly until it becomes empty. The number of flushing is determined * by the depth of chaining and should be relatively short. Whine if it * takes too long. */ void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ mutex_lock(&wq->mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: __flush_workqueue(wq); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { bool drained; raw_spin_lock_irq(&pwq->pool->lock); drained = pwq_is_empty(pwq); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) pr_warn("workqueue %s: %s() isn't complete after %u tries\n", wq->name, __func__, flush_cnt); mutex_unlock(&wq->mutex); goto reflush; } if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, bool from_cancel) { struct worker *worker = NULL; struct worker_pool *pool; struct pool_workqueue *pwq; struct workqueue_struct *wq; rcu_read_lock(); pool = get_work_pool(work); if (!pool) { rcu_read_unlock(); return false; } raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { if (unlikely(pwq->pool != pool)) goto already_gone; } else { worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; pwq = worker->current_pwq; } wq = pwq->wq; check_flush_dependency(wq, work, from_cancel); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); touch_work_lockdep_map(work, wq); /* * Force a lock recursion deadlock when using flush_work() inside a * single-threaded or rescuer equipped workqueue. * * For single threaded workqueues the deadlock happens when the work * is after the work issuing the flush_work(). For rescuer equipped * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. */ if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer)) touch_wq_lockdep_map(wq); rcu_read_unlock(); return true; already_gone: raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; if (WARN_ON(!wq_online)) return false; if (WARN_ON(!work->func)) return false; if (!start_flush_work(work, &barr, from_cancel)) return false; /* * start_flush_work() returned %true. If @from_cancel is set, we know * that @work must have been executing during start_flush_work() and * can't currently be queued. Its data must contain OFFQ bits. If @work * was queued on a BH workqueue, we also know that it was running in the * BH context and thus can be busy-waited. */ if (from_cancel) { unsigned long data = *work_data_bits(work); if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { /* * On RT, prevent a live lock when %current preempted * soft interrupt processing or prevents ksoftirqd from * running by keeping flipping BH. If the BH work item * runs on a different CPU then this has no effect other * than doing the BH disable/enable dance for nothing. * This is copied from * kernel/softirq.c::tasklet_unlock_spin_wait(). */ while (!try_wait_for_completion(&barr.done)) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { local_bh_disable(); local_bh_enable(); } else { cpu_relax(); } } goto out_destroy; } } wait_for_completion(&barr.done); out_destroy: destroy_work_on_stack(&barr.work); return true; } /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * * Wait until @work has finished execution. @work is guaranteed to be idle * on return if it hasn't been requeued since flush started. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_work(struct work_struct *work) { might_sleep(); return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); /** * flush_delayed_work - wait for a dwork to finish executing the last queueing * @dwork: the delayed work to flush * * Delayed timer is cancelled and the pending work is queued for * immediate execution. Like flush_work(), this function only * considers the last queueing instance of @dwork. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (timer_delete_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** * flush_rcu_work - wait for a rwork to finish executing the last queueing * @rwork: the rcu work to flush * * Return: * %true if flush_rcu_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_rcu_work(struct rcu_work *rwork) { if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { rcu_barrier(); flush_work(&rwork->work); return true; } else { return flush_work(&rwork->work); } } EXPORT_SYMBOL(flush_rcu_work); static void work_offqd_disable(struct work_offq_data *offqd) { const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; if (likely(offqd->disable < max)) offqd->disable++; else WARN_ONCE(true, "workqueue: work disable count overflowed\n"); } static void work_offqd_enable(struct work_offq_data *offqd) { if (likely(offqd->disable > 0)) offqd->disable--; else WARN_ONCE(true, "workqueue: work disable count underflowed\n"); } static bool __cancel_work(struct work_struct *work, u32 cflags) { struct work_offq_data offqd; unsigned long irq_flags; int ret; ret = work_grab_pending(work, cflags, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); if (cflags & WORK_CANCEL_DISABLE) work_offqd_disable(&offqd); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return ret; } static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { bool ret; ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); if (*work_data_bits(work) & WORK_OFFQ_BH) WARN_ON_ONCE(in_hardirq()); else might_sleep(); /* * Skip __flush_work() during early boot when we know that @work isn't * executing. This allows canceling during early boot. */ if (wq_online) __flush_work(work, true); if (!(cflags & WORK_CANCEL_DISABLE)) enable_work(work); return ret; } /* * See cancel_delayed_work() */ bool cancel_work(struct work_struct *work) { return __cancel_work(work, 0); } EXPORT_SYMBOL(cancel_work); /** * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * * Cancel @work and wait for its execution to finish. This function can be used * even if the work re-queues itself or migrates to another workqueue. On return * from this function, @work is guaranteed to be not pending or executing on any * CPU as long as there aren't racing enqueues. * * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. * Use cancel_delayed_work_sync() instead. * * Must be called from a sleepable context if @work was last queued on a non-BH * workqueue. Can also be called from non-hardirq atomic contexts including BH * if @work was last queued on a BH workqueue. * * Returns %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { return __cancel_work_sync(work, 0); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * * Kill off a pending delayed_work. * * Return: %true if @dwork was pending and canceled; %false if it wasn't * pending. * * Note: * The work callback function may still be running on return, unless * it returns %true and the work doesn't re-arm itself. Explicitly flush or * use cancel_delayed_work_sync() to wait on it. * * This function is safe to call from any context including IRQ handler. */ bool cancel_delayed_work(struct delayed_work *dwork) { return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel * * This is cancel_work_sync() for delayed works. * * Return: * %true if @dwork was pending, %false otherwise. */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** * disable_work - Disable and cancel a work item * @work: work item to disable * * Disable @work by incrementing its disable count and cancel it if currently * pending. As long as the disable count is non-zero, any attempt to queue @work * will fail and return %false. The maximum supported disable depth is 2 to the * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. * * Can be called from any context. Returns %true if @work was pending, %false * otherwise. */ bool disable_work(struct work_struct *work) { return __cancel_work(work, WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_work); /** * disable_work_sync - Disable, cancel and drain a work item * @work: work item to disable * * Similar to disable_work() but also wait for @work to finish if currently * executing. * * Must be called from a sleepable context if @work was last queued on a non-BH * workqueue. Can also be called from non-hardirq atomic contexts including BH * if @work was last queued on a BH workqueue. * * Returns %true if @work was pending, %false otherwise. */ bool disable_work_sync(struct work_struct *work) { return __cancel_work_sync(work, WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_work_sync); /** * enable_work - Enable a work item * @work: work item to enable * * Undo disable_work[_sync]() by decrementing @work's disable count. @work can * only be queued if its disable count is 0. * * Can be called from any context. Returns %true if the disable count reached 0. * Otherwise, %false. */ bool enable_work(struct work_struct *work) { struct work_offq_data offqd; unsigned long irq_flags; work_grab_pending(work, 0, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); work_offqd_enable(&offqd); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return !offqd.disable; } EXPORT_SYMBOL_GPL(enable_work); /** * disable_delayed_work - Disable and cancel a delayed work item * @dwork: delayed work item to disable * * disable_work() for delayed work items. */ bool disable_delayed_work(struct delayed_work *dwork) { return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_delayed_work); /** * disable_delayed_work_sync - Disable, cancel and drain a delayed work item * @dwork: delayed work item to disable * * disable_work_sync() for delayed work items. */ bool disable_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_delayed_work_sync); /** * enable_delayed_work - Enable a delayed work item * @dwork: delayed work item to enable * * enable_work() for delayed work items. */ bool enable_delayed_work(struct delayed_work *dwork) { return enable_work(&dwork->work); } EXPORT_SYMBOL_GPL(enable_delayed_work); /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * * schedule_on_each_cpu() executes @func on each online CPU using the * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. * * Return: * 0 on success, -errno on failure. */ int schedule_on_each_cpu(work_func_t func) { int cpu; struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); cpus_read_unlock(); free_percpu(works); return 0; } /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must * be available when the work executes) * * Executes the function immediately if process context is available, * otherwise schedules the function for delayed execution. * * Return: 0 - function was executed * 1 - function was scheduled for execution */ int execute_in_process_context(work_func_t fn, struct execute_work *ew) { if (!in_interrupt()) { fn(&ew->work); return 0; } INIT_WORK(&ew->work, fn); schedule_work(&ew->work); return 1; } EXPORT_SYMBOL_GPL(execute_in_process_context); /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free * * Undo alloc_workqueue_attrs(). */ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } /** * alloc_workqueue_attrs - allocate a workqueue_attrs * * Allocate a new workqueue_attrs, initialize with default settings and * return it. * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ struct workqueue_attrs *alloc_workqueue_attrs_noprof(void) { struct workqueue_attrs *attrs; attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (!attrs) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); return NULL; } static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from) { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); to->affn_strict = from->affn_strict; /* * Unlike hash and equality test, copying shouldn't ignore wq-only * fields as copying is used for both pool and wq attrs. Instead, * get_unbound_pool() explicitly clears the fields. */ to->affn_scope = from->affn_scope; to->ordered = from->ordered; } /* * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the * comments in 'struct workqueue_attrs' definition. */ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; if (attrs->affn_strict) cpumask_copy(attrs->cpumask, cpu_possible_mask); } /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { u32 hash = 0; hash = jhash_1word(attrs->nice, hash); hash = jhash_1word(attrs->affn_strict, hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); if (!attrs->affn_strict) hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } /* content equality test */ static bool wqattrs_equal(const struct workqueue_attrs *a, const struct workqueue_attrs *b) { if (a->nice != b->nice) return false; if (a->affn_strict != b->affn_strict) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } /* Update @attrs with actually available CPUs */ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, const cpumask_t *unbound_cpumask) { /* * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to * @unbound_cpumask. */ cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); if (unlikely(cpumask_empty(attrs->cpumask))) cpumask_copy(attrs->cpumask, unbound_cpumask); } /* find wq_pod_type to use for @attrs */ static const struct wq_pod_type * wqattrs_pod_type(const struct workqueue_attrs *attrs) { enum wq_affn_scope scope; struct wq_pod_type *pt; /* to synchronize access to wq_affn_dfl */ lockdep_assert_held(&wq_pool_mutex); if (attrs->affn_scope == WQ_AFFN_DFL) scope = wq_affn_dfl; else scope = attrs->affn_scope; pt = &wq_pod_types[scope]; if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && likely(pt->nr_pods)) return pt; /* * Before workqueue_init_topology(), only SYSTEM is available which is * initialized in workqueue_init_early(). */ pt = &wq_pod_types[WQ_AFFN_SYSTEM]; BUG_ON(!pt->nr_pods); return pt; } /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. * * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called * on @pool safely to release it. */ static int init_worker_pool(struct worker_pool *pool) { raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); INIT_WORK(&pool->idle_cull_work, idle_cull_fn); timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; wqattrs_clear_for_pool(pool->attrs); return 0; } #ifdef CONFIG_LOCKDEP static void wq_init_lockdep(struct workqueue_struct *wq) { char *lock_name; lockdep_register_key(&wq->key); lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); if (!lock_name) lock_name = wq->name; wq->lock_name = lock_name; wq->lockdep_map = &wq->__lockdep_map; lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0); } static void wq_unregister_lockdep(struct workqueue_struct *wq) { if (wq->lockdep_map != &wq->__lockdep_map) return; lockdep_unregister_key(&wq->key); } static void wq_free_lockdep(struct workqueue_struct *wq) { if (wq->lockdep_map != &wq->__lockdep_map) return; if (wq->lock_name != wq->name) kfree(wq->lock_name); } #else static void wq_init_lockdep(struct workqueue_struct *wq) { } static void wq_unregister_lockdep(struct workqueue_struct *wq) { } static void wq_free_lockdep(struct workqueue_struct *wq) { } #endif static void free_node_nr_active(struct wq_node_nr_active **nna_ar) { int node; for_each_node(node) { kfree(nna_ar[node]); nna_ar[node] = NULL; } kfree(nna_ar[nr_node_ids]); nna_ar[nr_node_ids] = NULL; } static void init_node_nr_active(struct wq_node_nr_active *nna) { nna->max = WQ_DFL_MIN_ACTIVE; atomic_set(&nna->nr, 0); raw_spin_lock_init(&nna->lock); INIT_LIST_HEAD(&nna->pending_pwqs); } /* * Each node's nr_active counter will be accessed mostly from its own node and * should be allocated in the node. */ static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar) { struct wq_node_nr_active *nna; int node; for_each_node(node) { nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node); if (!nna) goto err_free; init_node_nr_active(nna); nna_ar[node] = nna; } /* [nr_node_ids] is used as the fallback */ nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE); if (!nna) goto err_free; init_node_nr_active(nna); nna_ar[nr_node_ids] = nna; return 0; err_free: free_node_nr_active(nna_ar); return -ENOMEM; } static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); if (wq->flags & WQ_UNBOUND) free_node_nr_active(wq->node_nr_active); wq_free_lockdep(wq); free_percpu(wq->cpu_pwq); free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). * * Should be called with wq_pool_mutex held. */ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); if (--pool->refcnt) return; /* sanity checks */ if (WARN_ON(!(pool->cpu < 0)) || WARN_ON(!list_empty(&pool->worklist))) return; /* release id and unhash */ if (pool->id >= 0) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); /* * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. * * Having a concurrent manager is quite unlikely to happen as we can * only get here with * pwq->refcnt == pool->refcnt == 0 * which implies no work queued to the pool, which implies no worker can * become the manager. However a worker could have taken the role of * manager before the refcnts dropped to 0, since maybe_create_worker() * drops pool->lock */ while (true) { rcuwait_wait_event(&manager_wait, !(pool->flags & POOL_MANAGER_ACTIVE), TASK_UNINTERRUPTIBLE); mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); if (!(pool->flags & POOL_MANAGER_ACTIVE)) { pool->flags |= POOL_MANAGER_ACTIVE; break; } raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); } while ((worker = first_idle_worker(pool))) set_worker_dying(worker, &cull_list); WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); reap_dying_workers(&cull_list); /* shut down the timers */ timer_delete_sync(&pool->idle_timer); cancel_work_sync(&pool->idle_cull_work); timer_delete_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } /** * get_unbound_pool - get a worker_pool with the specified attributes * @attrs: the attributes of the worker_pool to get * * Obtain a worker_pool which has the same attributes as @attrs, bump the * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to * create a new one. * * Should be called with wq_pool_mutex held. * * Return: On success, a worker_pool with the same attributes as @attrs. * On failure, %NULL. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; return pool; } } /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ for (pod = 0; pod < pt->nr_pods; pod++) { if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { node = pt->pod_node[pod]; break; } } /* nope, create a new one */ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; /* create and start the initial worker */ if (wq_online && !create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); return pool; fail: if (pool) put_unbound_pool(pool); return NULL; } /* * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. */ static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); /* * For ordered workqueue with a plugged dfl_pwq, restart it now. */ if (!is_last && (wq->flags & __WQ_ORDERED)) unplug_oldest_pwq(wq); mutex_unlock(&wq->mutex); } if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); } if (!list_empty(&pwq->pending_node)) { struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pwq->pool->node); raw_spin_lock_irq(&nna->lock); list_del_init(&pwq->pending_node); raw_spin_unlock_irq(&nna->lock); } kfree_rcu(pwq, rcu); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) { wq_unregister_lockdep(wq); call_rcu(&wq->rcu, rcu_free_wq); } } /* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK); memset(pwq, 0, sizeof(*pwq)); pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pending_node); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ static void link_pwq(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq->mutex); /* may be called multiple times, ignore if already linked */ if (!list_empty(&pwq->pwqs_node)) return; /* set the matching work_color */ pwq->work_color = wq->work_color; /* link in @pwq */ list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct worker_pool *pool; struct pool_workqueue *pwq; lockdep_assert_held(&wq_pool_mutex); pool = get_unbound_pool(attrs); if (!pool) return NULL; pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!pwq) { put_unbound_pool(pool); return NULL; } init_pwq(pwq, wq, pool); return pwq; } static void apply_wqattrs_lock(void) { mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); } /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU * * Calculate the cpumask a workqueue with @attrs should use on @pod. * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled * and @pod has online CPUs requested by @attrs, the returned cpumask is the * intersection of the possible CPUs of @pod and @attrs->cpumask. * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; /* calculate possible CPUs in @pod that @attrs wants */ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); /* does @pod have any online CPUs @attrs wants? */ if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) { cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } } /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, int cpu, struct pool_workqueue *pwq) { struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu); struct pool_workqueue *old_pwq; lockdep_assert_held(&wq_pool_mutex); lockdep_assert_held(&wq->mutex); /* link_pwq() can handle duplicate calls */ link_pwq(pwq); old_pwq = rcu_access_pointer(*slot); rcu_assign_pointer(*slot, pwq); return old_pwq; } /* context to store the prepared attrs & pwqs before applying */ struct apply_wqattrs_ctx { struct workqueue_struct *wq; /* target workqueue */ struct workqueue_attrs *attrs; /* attrs to apply */ struct list_head list; /* queued for batching commit */ struct pool_workqueue *dfl_pwq; struct pool_workqueue *pwq_tbl[]; }; /* free the resources after success or abort */ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { int cpu; for_each_possible_cpu(cpu) put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); kfree(ctx); } } /* allocate the attrs and pwqs for later installation */ static struct apply_wqattrs_ctx * apply_wqattrs_prepare(struct workqueue_struct *wq, const struct workqueue_attrs *attrs, const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs; int cpu; lockdep_assert_held(&wq_pool_mutex); if (WARN_ON(attrs->affn_scope < 0 || attrs->affn_scope >= WQ_AFFN_NR_TYPES)) return ERR_PTR(-EINVAL); ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); if (!ctx || !new_attrs) goto out_free; /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ copy_workqueue_attrs(new_attrs, attrs); wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; for_each_possible_cpu(cpu) { if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { wq_calc_pod_cpumask(new_attrs, cpu); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; } } /* save the user configured attrs and sanitize it. */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; /* * For initialized ordered workqueues, there should only be one pwq * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution * of newly queued work items until execution of older work items in * the old pwq's have completed. */ if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)) ctx->dfl_pwq->plugged = true; ctx->wq = wq; return ctx; out_free: free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwqs and install the new ones */ for_each_possible_cpu(cpu) ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, ctx->pwq_tbl[cpu]); ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); /* update node_nr_active->max */ wq_update_node_max_active(ctx->wq, -1); /* rescuer needs to respect wq cpumask changes */ if (ctx->wq->rescuer) set_cpus_allowed_ptr(ctx->wq->rescuer->task, unbound_effective_cpumask(ctx->wq)); mutex_unlock(&ctx->wq->mutex); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); if (IS_ERR(ctx)) return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); return 0; } /** * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that * work items are affine to the pod it was issued on. Older pwqs are released as * in-flight work items finish. Note that a work item which repeatedly requeues * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * * Return: 0 on success and -errno on failure. */ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; mutex_lock(&wq_pool_mutex); ret = apply_workqueue_attrs_locked(wq, attrs); mutex_unlock(&wq_pool_mutex); return ret; } /** * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug * @wq: the target workqueue * @cpu: the CPU to update the pwq slot for * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged. * * * If pod affinity can't be adjusted due to memory allocation failure, it falls * back to @wq->dfl_pwq which may not be optimal but is always correct. * * Note that when the last allowed CPU of a pod goes offline for a workqueue * with a cpumask spanning multiple pods, the workers which were already * executing the work items for the workqueue will lose their CPU affinity and * may execute on any CPU. This is similar to how per-cpu workqueues behave on * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu) { struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; lockdep_assert_held(&wq_pool_mutex); if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* * We don't wanna alloc/free wq_attrs for each wq for each CPU. * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ target_attrs = unbound_wq_update_pwq_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ wq_calc_pod_cpumask(target_attrs, cpu); if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. */ mutex_lock(&wq->mutex); old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: mutex_lock(&wq->mutex); pwq = unbound_pwq(wq, -1); raw_spin_lock_irq(&pwq->pool->lock); get_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); old_pwq = install_unbound_pwq(wq, cpu, pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); } static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; lockdep_assert_held(&wq_pool_mutex); wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) goto enomem; if (!(wq->flags & WQ_UNBOUND)) { struct worker_pool __percpu *pools; if (wq->flags & WQ_BH) pools = bh_worker_pools; else pools = cpu_worker_pools; for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p; struct worker_pool *pool; pool = &(per_cpu_ptr(pools, cpu)[highpri]); pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!*pwq_p) goto enomem; init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; } if (wq->flags & __WQ_ORDERED) { struct pool_workqueue *dfl_pwq; ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ dfl_pwq = rcu_access_pointer(wq->dfl_pwq); WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } return ret; enomem: if (wq->cpu_pwq) { for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); if (pwq) kmem_cache_free(pwq_cache, pwq); } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; } return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", max_active, name, 1, WQ_MAX_ACTIVE); return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* * Workqueues which may be used during memory reclaim should have a rescuer * to guarantee forward progress. */ static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; char id_buf[WORKER_ID_LEN]; int ret; lockdep_assert_held(&wq_pool_mutex); if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) { pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", wq->name); return -ENOMEM; } rescuer->rescue_wq = wq; format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL); rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", wq->name, ERR_PTR(ret)); kfree(rescuer); return ret; } wq->rescuer = rescuer; if (wq->flags & WQ_UNBOUND) kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); else kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); return 0; } /** * wq_adjust_max_active - update a wq's max_active to the current setting * @wq: target workqueue * * If @wq isn't freezing, set @wq->max_active to the saved_max_active and * activate inactive work items accordingly. If @wq is freezing, clear * @wq->max_active to zero. */ static void wq_adjust_max_active(struct workqueue_struct *wq) { bool activated; int new_max, new_min; lockdep_assert_held(&wq->mutex); if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { new_max = 0; new_min = 0; } else { new_max = wq->saved_max_active; new_min = wq->saved_min_active; } if (wq->max_active == new_max && wq->min_active == new_min) return; /* * Update @wq->max/min_active and then kick inactive work items if more * active work items are allowed. This doesn't break work item ordering * because new work items are always queued behind existing inactive * work items if there are any. */ WRITE_ONCE(wq->max_active, new_max); WRITE_ONCE(wq->min_active, new_min); if (wq->flags & WQ_UNBOUND) wq_update_node_max_active(wq, -1); if (new_max == 0) return; /* * Round-robin through pwq's activating the first inactive work item * until max_active is filled. */ do { struct pool_workqueue *pwq; activated = false; for_each_pwq(pwq, wq) { unsigned long irq_flags; /* can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); if (pwq_activate_first_inactive(pwq, true)) { activated = true; kick_pool(pwq->pool); } raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); } } while (activated); } __printf(1, 0) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, int max_active, va_list args) { struct workqueue_struct *wq; size_t wq_size; int name_len; if (flags & WQ_BH) { if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS)) return NULL; if (WARN_ON_ONCE(max_active)) return NULL; } /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; /* allocate wq and format name */ if (flags & WQ_UNBOUND) wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1); else wq_size = sizeof(*wq); wq = kzalloc_noprof(wq_size, GFP_KERNEL); if (!wq) return NULL; if (flags & WQ_UNBOUND) { wq->unbound_attrs = alloc_workqueue_attrs_noprof(); if (!wq->unbound_attrs) goto err_free_wq; } name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); if (name_len >= WQ_NAME_LEN) pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name); if (flags & WQ_BH) { /* * BH workqueues always share a single execution context per CPU * and don't impose any max_active limit. */ max_active = INT_MAX; } else { max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); } /* init wq */ wq->flags = flags; wq->max_active = max_active; wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); wq->saved_max_active = wq->max_active; wq->saved_min_active = wq->min_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); INIT_LIST_HEAD(&wq->list); if (flags & WQ_UNBOUND) { if (alloc_node_nr_active(wq->node_nr_active) < 0) goto err_free_wq; } /* * wq_pool_mutex protects the workqueues list, allocations of PWQs, * and the global freeze state. */ apply_wqattrs_lock(); if (alloc_and_link_pwqs(wq) < 0) goto err_unlock_free_node_nr_active; mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); list_add_tail_rcu(&wq->list, &workqueues); if (wq_online && init_rescuer(wq) < 0) goto err_unlock_destroy; apply_wqattrs_unlock(); if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; return wq; err_unlock_free_node_nr_active: apply_wqattrs_unlock(); /* * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, * flushing the pwq_release_worker ensures that the pwq_release_workfn() * completes before calling kfree(wq). */ if (wq->flags & WQ_UNBOUND) { kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); } err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; err_unlock_destroy: apply_wqattrs_unlock(); err_destroy: destroy_workqueue(wq); return NULL; } __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; va_start(args, max_active); wq = __alloc_workqueue(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; wq_init_lockdep(wq); return wq; } EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) struct workqueue_struct * alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, struct lockdep_map *lockdep_map, ...) { struct workqueue_struct *wq; va_list args; va_start(args, lockdep_map); wq = __alloc_workqueue(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; wq->lockdep_map = lockdep_map; return wq; } EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map); #endif static bool pwq_busy(struct pool_workqueue *pwq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) if (pwq->nr_in_flight[i]) return true; if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1)) return true; if (!pwq_is_empty(pwq)) return true; return false; } /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. * * This function does NOT guarantee that non-pending work that has been * submitted with queue_delayed_work() and similar functions will be done * before destroying the workqueue. The fundamental problem is that, currently, * the workqueue has no way of accessing non-pending delayed_work. delayed_work * is only linked on the timer-side. All delayed_work must, therefore, be * canceled before calling this function. * * TODO: It would be better if the problem described above wouldn't exist and * destroy_workqueue() would cleanly cancel all pending and non-pending * delayed_work. */ void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't * lead to sysfs name conflicts. */ workqueue_sysfs_unregister(wq); /* mark the workqueue destruction is in progress */ mutex_lock(&wq->mutex); wq->flags |= __WQ_DESTROYING; mutex_unlock(&wq->mutex); /* drain it before proceeding with destruction */ drain_workqueue(wq); /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ if (wq->rescuer) { struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); kfree(rescuer); } /* * Sanity checks - grab all the locks so that we wait for all * in-flight operations which may do put_pwq(). */ mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_one_workqueue(wq); return; } raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); /* * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq * to put the base refs. @wq will be auto-destroyed from the last * pwq_put. RCU read lock prevents @wq from going away from under us. */ rcu_read_lock(); for_each_possible_cpu(cpu) { put_pwq_unlocked(unbound_pwq(wq, cpu)); RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL); } put_pwq_unlocked(unbound_pwq(wq, -1)); RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue * @max_active: new max_active value. * * Set max_active of @wq to @max_active. See the alloc_workqueue() function * comment. * * CONTEXT: * Don't call from IRQ context. */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { /* max_active doesn't mean anything for BH workqueues */ if (WARN_ON(wq->flags & WQ_BH)) return; /* disallow meddling with max_active for ordered workqueues */ if (WARN_ON(wq->flags & __WQ_ORDERED)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); wq->saved_max_active = max_active; if (wq->flags & WQ_UNBOUND) wq->saved_min_active = min(wq->saved_min_active, max_active); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** * workqueue_set_min_active - adjust min_active of an unbound workqueue * @wq: target unbound workqueue * @min_active: new min_active value * * Set min_active of an unbound workqueue. Unlike other types of workqueues, an * unbound workqueue is not guaranteed to be able to process max_active * interdependent work items. Instead, an unbound workqueue is guaranteed to be * able to process min_active number of interdependent work items which is * %WQ_DFL_MIN_ACTIVE by default. * * Use this function to adjust the min_active value between 0 and the current * max_active. */ void workqueue_set_min_active(struct workqueue_struct *wq, int min_active) { /* min_active is only meaningful for non-ordered unbound workqueues */ if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) != WQ_UNBOUND)) return; mutex_lock(&wq->mutex); wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } /** * current_work - retrieve %current task's work struct * * Determine if %current task is a workqueue worker and what it's working on. * Useful to find out the context that the %current task is running in. * * Return: work struct if %current task is a workqueue worker, %NULL otherwise. */ struct work_struct *current_work(void) { struct worker *worker = current_wq_worker(); return worker ? worker->current_work : NULL; } EXPORT_SYMBOL(current_work); /** * current_is_workqueue_rescuer - is %current workqueue rescuer? * * Determine whether %current is a workqueue rescuer. Can be used from * work functions to determine whether it's being run off the rescuer task. * * Return: %true if %current is a workqueue rescuer. %false otherwise. */ bool current_is_workqueue_rescuer(void) { struct worker *worker = current_wq_worker(); return worker && worker->rescue_wq; } /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question * @wq: target workqueue * * Test whether @wq's cpu workqueue for @cpu is congested. There is * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. * * With the exception of ordered workqueues, all workqueues have per-cpu * pool_workqueues, each with its own congested state. A workqueue being * congested on one CPU doesn't mean that the workqueue is contested on any * other CPUs. * * Return: * %true if congested, %false otherwise. */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool ret; rcu_read_lock(); preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); ret = !list_empty(&pwq->inactive_works); preempt_enable(); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); /** * work_busy - test whether a work is currently pending or running * @work: the work to be tested * * Test whether @work is currently pending or running. There is no * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * Return: * OR'd bitmask of WORK_BUSY_* bits. */ unsigned int work_busy(struct work_struct *work) { struct worker_pool *pool; unsigned long irq_flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; rcu_read_lock(); pool = get_work_pool(work); if (pool) { raw_spin_lock_irqsave(&pool->lock, irq_flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_busy); /** * set_worker_desc - set description for the current work item * @fmt: printf-style format string * @...: arguments for the format string * * This function can be called by a running work function to describe what * the work item is about. If the worker task gets dumped, this * information will be printed out together to help debugging. The * description can be at most WORKER_DESC_LEN including the trailing '\0'. */ void set_worker_desc(const char *fmt, ...) { struct worker *worker = current_wq_worker(); va_list args; if (worker) { va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); } } EXPORT_SYMBOL_GPL(set_worker_desc); /** * print_worker_info - print out worker information and description * @log_lvl: the log level to use when printing * @task: target task * * If @task is a worker and currently executing a work item, print out the * name of the workqueue being serviced and worker description set with * set_worker_desc() by the currently executing work item. * * This function can be safely called on any task as long as the * task_struct itself is accessible. While safe, this function isn't * synchronized and may print out mixups or garbages of limited length. */ void print_worker_info(const char *log_lvl, struct task_struct *task) { work_func_t *fn = NULL; char name[WQ_NAME_LEN] = { }; char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) return; /* * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ worker = kthread_probe_data(task); /* * Carefully copy the associated workqueue's workfn, name and desc. * Keep the original last '\0' in case the original is garbage. */ copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); } } static void pr_cont_pool_info(struct worker_pool *pool) { pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); if (pool->node != NUMA_NO_NODE) pr_cont(" node=%d", pool->node); pr_cont(" flags=0x%x", pool->flags); if (pool->flags & POOL_BH) pr_cont(" bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else pr_cont(" nice=%d", pool->attrs->nice); } static void pr_cont_worker_id(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & WQ_BH) pr_cont("bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else pr_cont("%d%s", task_pid_nr(worker->task), worker->rescue_wq ? "(RESCUER)" : ""); } struct pr_cont_work_struct { bool comma; work_func_t func; long ctr; }; static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) { if (!pcwsp->ctr) goto out_record; if (func == pcwsp->func) { pcwsp->ctr++; return; } if (pcwsp->ctr == 1) pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); else pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); pcwsp->ctr = 0; out_record: if ((long)func == -1L) return; pcwsp->comma = comma; pcwsp->func = func; pcwsp->ctr = 1; } static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) { if (work->func == wq_barrier_func) { struct wq_barrier *barr; barr = container_of(work, struct wq_barrier, work); pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { if (!comma) pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont_work_flush(comma, work->func, pcwsp); } } static void show_pwq(struct pool_workqueue *pwq) { struct pr_cont_work_struct pcws = { .ctr = 0, }; struct worker_pool *pool = pwq->pool; struct work_struct *work; struct worker *worker; bool has_in_flight = false, has_pending = false; int bkt; pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" active=%d refcnt=%d%s\n", pwq->nr_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq == pwq) { has_in_flight = true; break; } } if (has_in_flight) { bool comma = false; pr_info(" in-flight:"); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq != pwq) continue; pr_cont(" %s", comma ? "," : ""); pr_cont_worker_id(worker); pr_cont(":%ps", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); comma = true; } pr_cont("\n"); } list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) == pwq) { has_pending = true; break; } } if (has_pending) { bool comma = false; pr_info(" pending:"); list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) != pwq) continue; pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } if (!list_empty(&pwq->inactive_works)) { bool comma = false; pr_info(" inactive:"); list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } } /** * show_one_workqueue - dump state of specified workqueue * @wq: workqueue whose state will be printed */ void show_one_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool idle = true; unsigned long irq_flags; for_each_pwq(pwq, wq) { if (!pwq_is_empty(pwq)) { idle = false; break; } } if (idle) /* Nothing to print for idle workqueue */ return; pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); if (!pwq_is_empty(pwq)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. */ printk_deferred_enter(); show_pwq(pwq); printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } } /** * show_one_worker_pool - dump state of specified worker pool * @pool: worker pool whose state will be printed */ static void show_one_worker_pool(struct worker_pool *pool) { struct worker *worker; bool first = true; unsigned long irq_flags; unsigned long hung = 0; raw_spin_lock_irqsave(&pool->lock, irq_flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; /* How long the first pending work is waiting for a worker. */ if (!list_empty(&pool->worklist)) hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; /* * Defer printing to avoid deadlocks in console drivers that * queue work while holding locks also taken in their write * paths. */ printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); list_for_each_entry(worker, &pool->idle_list, entry) { pr_cont(" %s", first ? "idle: " : ""); pr_cont_worker_id(worker); first = false; } pr_cont("\n"); printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } /** * show_all_workqueues - dump workqueue state * * Called from a sysrq handler and prints out all busy workqueues and pools. */ void show_all_workqueues(void) { struct workqueue_struct *wq; struct worker_pool *pool; int pi; rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); list_for_each_entry_rcu(wq, &workqueues, list) show_one_workqueue(wq); for_each_pool(pool, pi) show_one_worker_pool(pool); rcu_read_unlock(); } /** * show_freezable_workqueues - dump freezable workqueue state * * Called from try_to_freeze_tasks() and prints out all freezable workqueues * still busy. */ void show_freezable_workqueues(void) { struct workqueue_struct *wq; rcu_read_lock(); pr_info("Showing freezable workqueues that are still busy:\n"); list_for_each_entry_rcu(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; show_one_workqueue(wq); } rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { /* stabilize PF_WQ_WORKER and worker pool association */ mutex_lock(&wq_pool_attach_mutex); if (task->flags & PF_WQ_WORKER) { struct worker *worker = kthread_data(task); struct worker_pool *pool = worker->pool; int off; off = format_worker_id(buf, size, worker, pool); if (pool) { raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If * current, prepend '+', otherwise '-'. */ if (worker->desc[0] != '\0') { if (worker->current_work) scnprintf(buf + off, size - off, "+%s", worker->desc); else scnprintf(buf + off, size - off, "-%s", worker->desc); } raw_spin_unlock_irq(&pool->lock); } } else { strscpy(buf, task->comm, size); } mutex_unlock(&wq_pool_attach_mutex); } #ifdef CONFIG_SMP /* * CPU hotplug. * * There are two challenges in supporting CPU hotplug. Firstly, there * are a lot of assumptions on strong associations among work, pwq and * pool which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, * worker pools serve mix of short, long and very long running works making * blocked draining impractical. * * This is solved by allowing the pools to be disassociated from the CPU * running as an unbound one and allowing it to be reattached later if the * cpu comes back online. */ static void unbind_workers(int cpu) { struct worker_pool *pool; struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * must be on the cpu. After this, they may become diasporas. * And the preemption disabled section in their sched callbacks * are guaranteed to see WORKER_UNBOUND since the code here * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; /* * The handling of nr_running in sched callbacks are disabled * now. Zap nr_running. After this, nr_running stays zero and * need_more_worker() and keep_working() are always true as * long as the worklist is not empty. This pool now behaves as * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ pool->nr_running = 0; /* * With concurrency management just turned off, a busy * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); for_each_pool_worker(worker, pool) unbind_worker(worker); mutex_unlock(&wq_pool_attach_mutex); } } /** * rebind_workers - rebind all workers of a pool to the associated CPU * @pool: pool of interest * * @pool->cpu is coming online. Rebind all workers to the CPU. */ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should * be on the run-queue of the associated CPU before any local * wake-ups for concurrency management happen, restore CPU affinity * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. Atomically * replace UNBOUND with another NOT_RUNNING flag REBOUND. * @worker will clear REBOUND using worker_clr_flags() when * it initiates the next execution cycle thus restoring * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. */ WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; WRITE_ONCE(worker->flags, worker_flags); } raw_spin_unlock_irq(&pool->lock); } /** * restore_unbound_workers_cpumask - restore cpumask of unbound workers * @pool: unbound pool of interest * @cpu: the CPU which is coming up * * An unbound pool may end up with a cpumask which doesn't have any online * CPUs. When a worker of such pool get scheduled, the scheduler resets * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any * online CPU before, cpus_allowed of all its workers should be restored. */ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } int workqueue_prepare_cpu(unsigned int cpu) { struct worker_pool *pool; for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; if (!create_worker(pool)) return -ENOMEM; } return 0; } int workqueue_online_cpu(unsigned int cpu) { struct worker_pool *pool; struct workqueue_struct *wq; int pi; mutex_lock(&wq_pool_mutex); cpumask_set_cpu(cpu, wq_online_cpumask); for_each_pool(pool, pi) { /* BH pools aren't affected by hotplug */ if (pool->flags & POOL_BH) continue; mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); mutex_unlock(&wq_pool_attach_mutex); } /* update pod affinity of unbound workqueues */ list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); return 0; } int workqueue_offline_cpu(unsigned int cpu) { struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ if (WARN_ON(cpu != smp_processor_id())) return -1; unbind_workers(cpu); /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); cpumask_clear_cpu(cpu, wq_online_cpumask); list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, cpu); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); return 0; } struct work_for_cpu { struct work_struct work; long (*fn)(void *); void *arg; long ret; }; static void work_for_cpu_fn(struct work_struct *work) { struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); wfc->ret = wfc->fn(wfc->arg); } /** * work_on_cpu_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg * @key: The lock class key for lock debugging purposes * * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER /** * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void freeze_workqueues_begin(void) { struct workqueue_struct *wq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } mutex_unlock(&wq_pool_mutex); } /** * freeze_workqueues_busy - are freezable workqueues still busy? * * Check whether freezing is complete. This function must be called * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: * Grabs and releases wq_pool_mutex. * * Return: * %true if some freezable workqueues are still busy. %false if freezing * is complete. */ bool freeze_workqueues_busy(void) { bool busy = false; struct workqueue_struct *wq; struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(!workqueue_freezing); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; rcu_read_unlock(); goto out_unlock; } } rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); return busy; } /** * thaw_workqueues - thaw workqueues * * Thaw workqueues. Normal queueing is restored and all collected * frozen works are transferred to their respective pool worklists. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void thaw_workqueues(void) { struct workqueue_struct *wq; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } out_unlock: mutex_unlock(&wq_pool_mutex); } #endif /* CONFIG_FREEZER */ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) { LIST_HEAD(ctxs); int ret = 0; struct workqueue_struct *wq; struct apply_wqattrs_ctx *ctx, *n; lockdep_assert_held(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING)) continue; ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); break; } list_add_tail(&ctx->list, &ctxs); } list_for_each_entry_safe(ctx, n, &ctxs, list) { if (!ret) apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); } if (!ret) { mutex_lock(&wq_pool_attach_mutex); cpumask_copy(wq_unbound_cpumask, unbound_cpumask); mutex_unlock(&wq_pool_attach_mutex); } return ret; } /** * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask * * This function can be called from cpuset code to provide a set of isolated * CPUs that should be excluded from wq_unbound_cpumask. */ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) { cpumask_var_t cpumask; int ret = 0; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; mutex_lock(&wq_pool_mutex); /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten * by any subsequent write to workqueue/cpumask sysfs file. */ if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) cpumask_copy(cpumask, wq_requested_unbound_cpumask); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); /* Save the current isolated cpumask & export it via sysfs */ if (!ret) cpumask_copy(wq_isolated_cpumask, exclude_cpumask); mutex_unlock(&wq_pool_mutex); free_cpumask_var(cpumask); return ret; } static int parse_affn_scope(const char *val) { int i; for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) return i; } return -EINVAL; } static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) { struct workqueue_struct *wq; int affn, cpu; affn = parse_affn_scope(val); if (affn < 0) return affn; if (affn == WQ_AFFN_DFL) return -EINVAL; cpus_read_lock(); mutex_lock(&wq_pool_mutex); wq_affn_dfl = affn; list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) unbound_wq_update_pwq(wq, cpu); } mutex_unlock(&wq_pool_mutex); cpus_read_unlock(); return 0; } static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); } static const struct kernel_param_ops wq_affn_dfl_ops = { .set = wq_affn_dfl_set, .get = wq_affn_dfl_get, }; module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * * per_cpu RO bool : whether the workqueue is per-cpu or unbound * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. * * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; struct device dev; }; static struct workqueue_struct *dev_to_wq(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); return wq_dev->wq; } static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); } static DEVICE_ATTR_RO(per_cpu); static ssize_t max_active_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); } static ssize_t max_active_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); int val; if (sscanf(buf, "%d", &val) != 1 || val <= 0) return -EINVAL; workqueue_set_max_active(wq, val); return count; } static DEVICE_ATTR_RW(max_active); static struct attribute *wq_sysfs_attrs[] = { &dev_attr_per_cpu.attr, &dev_attr_max_active.attr, NULL, }; ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); mutex_unlock(&wq->mutex); return written; } /* prepare workqueue_attrs for sysfs store operations */ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) { struct workqueue_attrs *attrs; lockdep_assert_held(&wq_pool_mutex); attrs = alloc_workqueue_attrs(); if (!attrs) return NULL; copy_workqueue_attrs(attrs, wq->unbound_attrs); return attrs; } static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; if (sscanf(buf, "%d", &attrs->nice) == 1 && attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs_locked(wq, attrs); else ret = -EINVAL; out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(wq->unbound_attrs->cpumask)); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; ret = cpumask_parse(buf, attrs->cpumask); if (!ret) ret = apply_workqueue_attrs_locked(wq, attrs); out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affn_scope_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", wq_affn_names[WQ_AFFN_DFL], wq_affn_names[wq_affn_dfl]); else written = scnprintf(buf, PAGE_SIZE, "%s\n", wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_affn_scope_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int affn, ret = -ENOMEM; affn = parse_affn_scope(buf); if (affn < 0) return affn; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affinity_strict_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->affn_strict); } static ssize_t wq_affinity_strict_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int v, ret = -ENOMEM; if (sscanf(buf, "%d", &v) != 1) return -EINVAL; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_strict = (bool)v; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; static const struct bus_type wq_subsys = { .name = "workqueue", .dev_groups = wq_sysfs_groups, }; /** * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask * @cpumask: the cpumask to set * * The low-level workqueues cpumask is a global cpumask that limits * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; /* * Not excluding isolated cpus on purpose. * If the user wishes to include them, we allow that. */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { ret = 0; apply_wqattrs_lock(); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); if (!ret) cpumask_copy(wq_requested_unbound_cpumask, cpumask); apply_wqattrs_unlock(); } return ret; } static ssize_t __wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } static ssize_t cpumask_requested_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); } static DEVICE_ATTR_RO(cpumask_requested); static ssize_t cpumask_isolated_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); } static DEVICE_ATTR_RO(cpumask_isolated); static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); } static ssize_t cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; int ret; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; ret = cpumask_parse(buf, cpumask); if (!ret) ret = workqueue_set_unbound_cpumask(cpumask); free_cpumask_var(cpumask); return ret ? ret : count; } static DEVICE_ATTR_RW(cpumask); static struct attribute *wq_sysfs_cpumask_attrs[] = { &dev_attr_cpumask.attr, &dev_attr_cpumask_requested.attr, &dev_attr_cpumask_isolated.attr, NULL, }; ATTRIBUTE_GROUPS(wq_sysfs_cpumask); static int __init wq_sysfs_init(void) { return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); } core_initcall(wq_sysfs_init); static void wq_device_release(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); kfree(wq_dev); } /** * workqueue_sysfs_register - make a workqueue visible in sysfs * @wq: the workqueue to register * * Expose @wq in sysfs under /sys/bus/workqueue/devices. * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set * which is the preferred method. * * Workqueue user should use this function directly iff it wants to apply * workqueue_attrs before making the workqueue visible in sysfs; otherwise, * apply_workqueue_attrs() may race against userland updating the * attributes. * * Return: 0 on success, -errno on failure. */ int workqueue_sysfs_register(struct workqueue_struct *wq) { struct wq_device *wq_dev; int ret; /* * Adjusting max_active breaks ordering guarantee. Disallow exposing * ordered workqueues. */ if (WARN_ON(wq->flags & __WQ_ORDERED)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); if (!wq_dev) return -ENOMEM; wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; wq_dev->dev.release = wq_device_release; dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until * everything is ready. */ dev_set_uevent_suppress(&wq_dev->dev, true); ret = device_register(&wq_dev->dev); if (ret) { put_device(&wq_dev->dev); wq->wq_dev = NULL; return ret; } if (wq->flags & WQ_UNBOUND) { struct device_attribute *attr; for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { ret = device_create_file(&wq_dev->dev, attr); if (ret) { device_unregister(&wq_dev->dev); wq->wq_dev = NULL; return ret; } } } dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } /** * workqueue_sysfs_unregister - undo workqueue_sysfs_register() * @wq: the workqueue to unregister * * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { struct wq_device *wq_dev = wq->wq_dev; if (!wq->wq_dev) return; wq->wq_dev = NULL; device_unregister(&wq_dev->dev); } #else /* CONFIG_SYSFS */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ /* * Workqueue watchdog. * * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal * flush dependency, a concurrency managed work item which stays RUNNING * indefinitely. Workqueue stalls can be very difficult to debug as the * usual warning mechanisms don't trigger and internal workqueue state is * largely opaque. * * Workqueue watchdog monitors all worker pools periodically and dumps * state if some pools failed to make forward progress for a while where * forward progress is defined as the first item on ->worklist changing. * * This mechanism is controlled through the kernel parameter * "workqueue.watchdog_thresh" which can be updated at runtime through the * corresponding sysfs parameter file. */ #ifdef CONFIG_WQ_WATCHDOG static unsigned long wq_watchdog_thresh = 30; static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; static unsigned int wq_panic_on_stall; module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. * Pending work items should be handled by another idle worker * in all other situations. */ static void show_cpu_pool_hog(struct worker_pool *pool) { struct worker *worker; unsigned long irq_flags; int bkt; raw_spin_lock_irqsave(&pool->lock, irq_flags); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (task_is_running(worker->task)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. */ printk_deferred_enter(); pr_info("pool %d:\n", pool->id); sched_show_task(worker->task); printk_deferred_exit(); } } raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } static void show_cpu_pools_hogs(void) { struct worker_pool *pool; int pi; pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); rcu_read_lock(); for_each_pool(pool, pi) { if (pool->cpu_stall) show_cpu_pool_hog(pool); } rcu_read_unlock(); } static void panic_on_wq_watchdog(void) { static unsigned int wq_stall; if (wq_panic_on_stall) { wq_stall++; BUG_ON(wq_stall >= wq_panic_on_stall); } } static void wq_watchdog_reset_touched(void) { int cpu; wq_watchdog_touched = jiffies; for_each_possible_cpu(cpu) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; int pi; if (!thresh) return; rcu_read_lock(); for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; pool->cpu_stall = false; if (list_empty(&pool->worklist)) continue; /* * If a virtual machine is stopped by the host it can look to * the watchdog like a stall. */ kvm_check_and_clear_guest_paused(); /* get the latest of pool and touched timestamps */ if (pool->cpu >= 0) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); else touched = READ_ONCE(wq_watchdog_touched); pool_ts = READ_ONCE(pool->watchdog_ts); if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", jiffies_to_msecs(now - pool_ts) / 1000); } } rcu_read_unlock(); if (lockup_detected) show_all_workqueues(); if (cpu_pool_stall) show_cpu_pools_hogs(); if (lockup_detected) panic_on_wq_watchdog(); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } notrace void wq_watchdog_touch(int cpu) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; unsigned long touch_ts = READ_ONCE(wq_watchdog_touched); unsigned long now = jiffies; if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = now; else WARN_ONCE(1, "%s should be called with valid CPU", __func__); /* Don't unnecessarily store to global cacheline */ if (time_after(now, touch_ts + thresh / 4)) WRITE_ONCE(wq_watchdog_touched, jiffies); } static void wq_watchdog_set_thresh(unsigned long thresh) { wq_watchdog_thresh = 0; timer_delete_sync(&wq_watchdog_timer); if (thresh) { wq_watchdog_thresh = thresh; wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); } } static int wq_watchdog_param_set_thresh(const char *val, const struct kernel_param *kp) { unsigned long thresh; int ret; ret = kstrtoul(val, 0, &thresh); if (ret) return ret; if (system_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; return 0; } static const struct kernel_param_ops wq_watchdog_thresh_ops = { .set = wq_watchdog_param_set_thresh, .get = param_get_ulong, }; module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, 0644); static void wq_watchdog_init(void) { timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ static void bh_pool_kick_normal(struct irq_work *irq_work) { raise_softirq_irqoff(TASKLET_SOFTIRQ); } static void bh_pool_kick_highpri(struct irq_work *irq_work) { raise_softirq_irqoff(HI_SOFTIRQ); } static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) { if (!cpumask_intersects(wq_unbound_cpumask, mask)) { pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); return; } cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); } static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = nice; pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); mutex_unlock(&wq_pool_mutex); } /** * workqueue_init_early - early init for workqueue subsystem * * This is the first step of three-staged workqueue subsystem initialization and * invoked as soon as the bare basics - memory allocation, cpumasks and idr are * up. It sets up all the data structures and system workqueues and allows early * boot code to create workqueues and queue/cancel work items. Actual work item * execution starts only after kthreads can be created and scheduled right * before early initcalls. */ void __init workqueue_init_early(void) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal, bh_pool_kick_highpri }; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!unbound_wq_update_pwq_attrs_buf); /* * If nohz_full is enabled, set power efficient workqueue as unbound. * This allows workqueue items to be moved to HK CPUs. */ if (housekeeping_enabled(HK_TYPE_TICK)) wq_power_efficient = true; /* initialize WQ_AFFN_SYSTEM pods */ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); pt->nr_pods = 1; cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); pt->pod_node[0] = NUMA_NO_NODE; pt->cpu_pod[0] = 0; /* initialize BH and CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; i = 0; for_each_bh_worker_pool(pool, cpu) { init_cpu_worker_pool(pool, cpu, std_nice[i]); pool->flags |= POOL_BH; init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]); i++; } i = 0; for_each_cpu_worker_pool(pool, cpu) init_cpu_worker_pool(pool, cpu, std_nice[i++]); } /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. */ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; attrs->ordered = true; ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); system_percpu_wq = alloc_workqueue("events", 0, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", WQ_POWER_EFFICIENT, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", WQ_FREEZABLE | WQ_POWER_EFFICIENT, 0); system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI, 0); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || !system_bh_wq || !system_bh_highpri_wq); } static void __init wq_cpu_intensive_thresh_init(void) { unsigned long thresh; unsigned long bogo; pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what * most consider human-perceivable. However, the kernel also runs on a * lot slower CPUs including microcontrollers where the threshold is way * too low. * * Let's scale up the threshold upto 1 second if BogoMips is below 4000. * This is by no means accurate but it doesn't have to be. The mechanism * is still useful even when the threshold is fully scaled up. Also, as * the reports would usually be applicable to everyone, some machines * operating on longer thresholds won't significantly diminish their * usefulness. */ thresh = 10 * USEC_PER_MSEC; /* see init/calibrate.c for lpj -> BogoMIPS calculation */ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); if (bogo < 4000) thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", loops_per_jiffy, bogo, thresh); wq_cpu_intensive_thresh_us = thresh; } /** * workqueue_init - bring workqueue subsystem fully online * * This is the second step of three-staged workqueue subsystem initialization * and invoked as soon as kthreads can be created and scheduled. Workqueues have * been created and work items queued on them, but there are no kworkers * executing the work items yet. Populate the worker pools with the initial * workers and enable future kworker creations. */ void __init workqueue_init(void) { struct workqueue_struct *wq; struct worker_pool *pool; int cpu, bkt; wq_cpu_intensive_thresh_init(); mutex_lock(&wq_pool_mutex); /* * Per-cpu pools created earlier could be missing node hint. Fix them * up. Also, create a rescuer for workqueues that requested it. */ for_each_possible_cpu(cpu) { for_each_bh_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); for_each_cpu_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); } list_for_each_entry(wq, &workqueues, list) { WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); } mutex_unlock(&wq_pool_mutex); /* * Create the initial workers. A BH pool has one pseudo worker that * represents the shared BH execution context and thus doesn't get * affected by hotplug events. Create the BH pseudo workers for all * possible CPUs here. */ for_each_possible_cpu(cpu) for_each_bh_worker_pool(pool, cpu) BUG_ON(!create_worker(pool)); for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; BUG_ON(!create_worker(pool)); } } hash_for_each(unbound_pool_hash, bkt, pool, hash_node) BUG_ON(!create_worker(pool)); wq_online = true; wq_watchdog_init(); } /* * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique * and consecutive pod ID. The rest of @pt is initialized accordingly. */ static void __init init_pod_type(struct wq_pod_type *pt, bool (*cpus_share_pod)(int, int)) { int cur, pre, cpu, pod; pt->nr_pods = 0; /* init @pt->cpu_pod[] according to @cpus_share_pod() */ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); BUG_ON(!pt->cpu_pod); for_each_possible_cpu(cur) { for_each_possible_cpu(pre) { if (pre >= cur) { pt->cpu_pod[cur] = pt->nr_pods++; break; } if (cpus_share_pod(cur, pre)) { pt->cpu_pod[cur] = pt->cpu_pod[pre]; break; } } } /* init the rest to match @pt->cpu_pod[] */ pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); BUG_ON(!pt->pod_cpus || !pt->pod_node); for (pod = 0; pod < pt->nr_pods; pod++) BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); for_each_possible_cpu(cpu) { cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); } } static bool __init cpus_dont_share(int cpu0, int cpu1) { return false; } static bool __init cpus_share_smt(int cpu0, int cpu1) { #ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); #else return false; #endif } static bool __init cpus_share_numa(int cpu0, int cpu1) { return cpu_to_node(cpu0) == cpu_to_node(cpu1); } /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * * This is the third step of three-staged workqueue subsystem initialization and * invoked after SMP and topology information are fully initialized. It * initializes the unbound CPU pods accordingly. */ void __init workqueue_init_topology(void) { struct workqueue_struct *wq; int cpu; init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); wq_topo_initialized = true; mutex_lock(&wq_pool_mutex); /* * Workqueues allocated earlier would have all CPUs sharing the default * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue * and CPU combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) unbound_wq_update_pwq(wq, cpu); if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); } void __warn_flushing_systemwide_wq(void) { pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); dump_stack(); } EXPORT_SYMBOL(__warn_flushing_systemwide_wq); static int __init workqueue_unbound_cpus_setup(char *str) { if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { cpumask_clear(&wq_cmdline_cpumask); pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); } return 1; } __setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); |
| 25 17 17 25 25 25 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 | // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/iversion.h> #include <linux/filelock.h> #include <linux/jiffies.h> #include "super.h" #include "mds_client.h" #include "cache.h" #include "crypto.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> /* * Capability management * * The Ceph metadata servers control client access to inode metadata * and file data by issuing capabilities, granting clients permission * to read and/or write both inode field and file data to OSDs * (storage nodes). Each capability consists of a set of bits * indicating which operations are allowed. * * If the client holds a *_SHARED cap, the client has a coherent value * that can be safely read from the cached inode. * * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the * client is allowed to change inode attributes (e.g., file size, * mtime), note its dirty state in the ceph_cap, and asynchronously * flush that metadata change to the MDS. * * In the event of a conflicting operation (perhaps by another * client), the MDS will revoke the conflicting client capabilities. * * In order for a client to cache an inode, it must hold a capability * with at least one MDS server. When inodes are released, release * notifications are batched and periodically sent en masse to the MDS * cluster to release server state. */ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid); /* * Generate readable cap strings for debugging output. */ #define MAX_CAP_STR 20 static char cap_str[MAX_CAP_STR][40]; static DEFINE_SPINLOCK(cap_str_lock); static int last_cap_str; static char *gcap_string(char *s, int c) { if (c & CEPH_CAP_GSHARED) *s++ = 's'; if (c & CEPH_CAP_GEXCL) *s++ = 'x'; if (c & CEPH_CAP_GCACHE) *s++ = 'c'; if (c & CEPH_CAP_GRD) *s++ = 'r'; if (c & CEPH_CAP_GWR) *s++ = 'w'; if (c & CEPH_CAP_GBUFFER) *s++ = 'b'; if (c & CEPH_CAP_GWREXTEND) *s++ = 'a'; if (c & CEPH_CAP_GLAZYIO) *s++ = 'l'; return s; } const char *ceph_cap_string(int caps) { int i; char *s; int c; spin_lock(&cap_str_lock); i = last_cap_str++; if (last_cap_str == MAX_CAP_STR) last_cap_str = 0; spin_unlock(&cap_str_lock); s = cap_str[i]; if (caps & CEPH_CAP_PIN) *s++ = 'p'; c = (caps >> CEPH_CAP_SAUTH) & 3; if (c) { *s++ = 'A'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SLINK) & 3; if (c) { *s++ = 'L'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SXATTR) & 3; if (c) { *s++ = 'X'; s = gcap_string(s, c); } c = caps >> CEPH_CAP_SFILE; if (c) { *s++ = 'F'; s = gcap_string(s, c); } if (s == cap_str[i]) *s++ = '-'; *s = 0; return cap_str[i]; } void ceph_caps_init(struct ceph_mds_client *mdsc) { INIT_LIST_HEAD(&mdsc->caps_list); spin_lock_init(&mdsc->caps_list_lock); } void ceph_caps_finalize(struct ceph_mds_client *mdsc) { struct ceph_cap *cap; spin_lock(&mdsc->caps_list_lock); while (!list_empty(&mdsc->caps_list)) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } mdsc->caps_total_count = 0; mdsc->caps_avail_count = 0; mdsc->caps_use_count = 0; mdsc->caps_reserve_count = 0; mdsc->caps_min_count = 0; spin_unlock(&mdsc->caps_list_lock); } void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_min_count = fsopt->max_readdir; if (mdsc->caps_min_count < 1024) mdsc->caps_min_count = 1024; mdsc->caps_use_max = fsopt->caps_max; if (mdsc->caps_use_max > 0 && mdsc->caps_use_max < mdsc->caps_min_count) mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) { struct ceph_cap *cap; int i; if (nr_caps) { BUG_ON(mdsc->caps_reserve_count < nr_caps); mdsc->caps_reserve_count -= nr_caps; if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count -= nr_caps; for (i = 0; i < nr_caps; i++) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } } else { mdsc->caps_avail_count += nr_caps; } doutc(mdsc->fsc->client, "caps %d = %d used + %d resv + %d avail\n", mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } } /* * Called under mdsc->mutex. */ int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { struct ceph_client *cl = mdsc->fsc->client; int i, j; struct ceph_cap *cap; int have; int alloc = 0; int max_caps; int err = 0; bool trimmed = false; struct ceph_mds_session *s; LIST_HEAD(newcaps); doutc(cl, "ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count >= need) have = need; else have = mdsc->caps_avail_count; mdsc->caps_avail_count -= have; mdsc->caps_reserve_count += have; BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; ) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { list_add(&cap->caps_item, &newcaps); alloc++; i++; continue; } if (!trimmed) { for (j = 0; j < mdsc->max_sessions; j++) { s = __ceph_lookup_mds_session(mdsc, j); if (!s) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); max_caps = s->s_nr_caps - (need - i); ceph_trim_caps(mdsc, s, max_caps); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } trimmed = true; spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { int more_have; if (mdsc->caps_avail_count >= need - i) more_have = need - i; else more_have = mdsc->caps_avail_count; i += more_have; have += more_have; mdsc->caps_avail_count -= more_have; mdsc->caps_reserve_count += more_have; } spin_unlock(&mdsc->caps_list_lock); continue; } pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need, have + alloc); err = -ENOMEM; break; } if (!err) { BUG_ON(have + alloc != need); ctx->count = need; ctx->used = 0; } spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; mdsc->caps_reserve_count += alloc; list_splice(&newcaps, &mdsc->caps_list); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); if (err) __ceph_unreserve_caps(mdsc, have + alloc); spin_unlock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); return err; } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; bool reclaim = false; if (!ctx->count) return; doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; if (mdsc->caps_use_max > 0 && mdsc->caps_use_count > mdsc->caps_use_max) reclaim = true; spin_unlock(&mdsc->caps_list_lock); if (reclaim) ceph_reclaim_caps_nr(mdsc, ctx->used); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; spin_unlock(&mdsc->caps_list_lock); } else { spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { BUG_ON(list_empty(&mdsc->caps_list)); mdsc->caps_avail_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } spin_unlock(&mdsc->caps_list_lock); } return cap; } spin_lock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); BUG_ON(ctx->count > mdsc->caps_reserve_count); BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); return cap; } void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { struct ceph_client *cl = mdsc->fsc->client; spin_lock(&mdsc->caps_list_lock); doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to * avoid lots of free/alloc churn. */ if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { mdsc->caps_avail_count++; list_add(&cap->caps_item, &mdsc->caps_list); } BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); } void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { struct ceph_mds_client *mdsc = fsc->mdsc; spin_lock(&mdsc->caps_list_lock); if (total) *total = mdsc->caps_total_count; if (avail) *avail = mdsc->caps_avail_count; if (used) *used = mdsc->caps_use_count; if (reserved) *reserved = mdsc->caps_reserve_count; if (min) *min = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } /* * Find ceph_cap for given mds, if any. * * Called with i_ceph_lock held. */ struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; while (n) { cap = rb_entry(n, struct ceph_cap, ci_node); if (mds < cap->mds) n = n->rb_left; else if (mds > cap->mds) n = n->rb_right; else return cap; } return NULL; } struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); spin_unlock(&ci->i_ceph_lock); return cap; } /* * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) { struct rb_node **p = &ci->i_caps.rb_node; struct rb_node *parent = NULL; struct ceph_cap *cap = NULL; while (*p) { parent = *p; cap = rb_entry(parent, struct ceph_cap, ci_node); if (new->mds < cap->mds) p = &(*p)->rb_left; else if (new->mds > cap->mds) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->ci_node, parent, p); rb_insert_color(&new->ci_node, &ci->i_caps); } /* * (re)set cap hold timeouts, which control the delayed release * of unused caps back to the MDS. Should be called on cap use. */ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode, ceph_vinop(inode), ci->i_hold_caps_max - jiffies); } /* * (Re)queue cap at the end of the delayed cap release list. * * If I_FLUSH is set, leave the inode at the front of the list. * * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n", inode, ceph_vinop(inode), ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); if (!list_empty(&ci->i_cap_delay_list)) { if (ci->i_ceph_flags & CEPH_I_FLUSH) goto no_change; list_del_init(&ci->i_cap_delay_list); } __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); } } /* * Queue an inode for immediate writeback. Mark inode with I_FLUSH, * indicating we should send a cap message to flush dirty metadata * asap, and move to the front of the delayed cap list. */ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* * Cancel delayed work on cap. * * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); list_del_init(&ci->i_cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* Common issue checks for add_cap, handle_cap_grant. */ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); unsigned had = __ceph_caps_issued(ci, NULL); lockdep_assert_held(&ci->i_ceph_lock); /* * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; } /* * If FILE_SHARED is newly issued, mark dir not complete. We don't * know what happened to this directory while we didn't have the cap. * If FILE_SHARED is being revoked, also mark dir not complete. It * stops on-going cached readdir. */ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); if (S_ISDIR(ci->netfs.inode.i_mode)) { doutc(cl, " marking %p NOT complete\n", inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); } } /** * change_auth_cap_ses - move inode to appropriate lists when auth caps change * @ci: inode to be moved * @session: new auth caps session */ void change_auth_cap_ses(struct ceph_inode_info *ci, struct ceph_mds_session *session) { lockdep_assert_held(&ci->i_ceph_lock); if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) return; spin_lock(&session->s_mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) list_move(&ci->i_dirty_item, &session->s_cap_dirty); if (!list_empty(&ci->i_flushing_item)) list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); spin_unlock(&session->s_mdsc->cap_dirty_lock); } /* * Add a capability under the given MDS session. * * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; u32 gen; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode, ceph_vinop(inode), session->s_mds, cap_id, ceph_cap_string(issued), seq); gen = atomic_read(&session->s_cap_gen); cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; *new_cap = NULL; cap->issued = 0; cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; atomic64_inc(&mdsc->metric.total_caps); spin_unlock(&session->s_cap_lock); } else { spin_lock(&session->s_cap_lock); list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export * message, but still haven't received the cap import message. * handle_cap_export() updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing * a message that was send before the cap import message. So * don't remove caps. */ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != cap_id); seq = cap->seq; mseq = cap->mseq; issued |= cap->issued; flags |= CEPH_CAP_FLAG_AUTH; } } if (!ci->i_snap_realm || ((flags & CEPH_CAP_FLAG_AUTH) && realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) { /* * add this inode to the appropriate snap realm */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) ceph_change_snap_realm(inode, realm); else WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n", __func__, realmino, ci->i_vino.ino, ci->i_snap_realm ? ci->i_snap_realm->ino : 0); } __check_cap_issue(ci, cap, issued); /* * If we are issued caps we don't want, or the mds' wanted * value appears to be off, queue a check so we'll release * later and/or update the mds wanted value. */ actual_wanted = __ceph_caps_wanted(ci); if ((wanted & ~actual_wanted) || (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); __cap_delay_requeue(mdsc, ci); } if (flags & CEPH_CAP_FLAG_AUTH) { if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { if (ci->i_auth_cap && ci->i_auth_cap->session != cap->session) change_auth_cap_ses(ci, cap->session); ci->i_auth_cap = cap; cap->mds_wanted = wanted; } } else { WARN_ON(ci->i_auth_cap == cap); } doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), ceph_cap_string(issued|cap->issued), seq, mds); cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; wake_up_all(&ci->i_cap_wq); } /* * Return true if cap has not timed out and belongs to the current * generation of the MDS session (i.e. has not gone 'stale' due to * us losing touch with the mds). */ static int __cap_is_valid(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_client *cl = cap->session->s_mdsc->fsc->client; unsigned long ttl; u32 gen; gen = atomic_read(&cap->session->s_cap_gen); ttl = cap->session->s_cap_ttl; if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } return 1; } /* * Return set of valid cap bits issued to us. Note that caps time * out, and may be invalidated in bulk if the client session times out * and session->s_cap_gen is bumped. */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; if (implemented) *implemented = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; } /* * exclude caps issued by non-auth MDS, but are been revoking * by the auth MDS. The non-auth MDS should be revoking/exporting * these caps, but the message is delayed. */ if (ci->i_auth_cap) { cap = ci->i_auth_cap; have &= ~cap->implemented | cap->issued; } return have; } /* * Get cap bits issued by caps other than @ocap */ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) { int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap == ocap) continue; if (!__cap_is_valid(cap)) continue; have |= cap->issued; } return have; } /* * Move a cap to the end of the LRU (oldest caps at list head, newest * at list tail). */ static void __touch_cap(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_mds_session *s = cap->session; struct ceph_client *cl = s->s_mdsc->fsc->client; spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode, ceph_vinop(inode), cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n", inode, ceph_vinop(inode), cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } /* * Check if we hold the given mask. If so, move the cap(s) to the * front of their respective LRUs. (This is the preferred way for * callers to check for caps they want.) */ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct rb_node *p; int have = ci->i_snap_caps; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) __touch_cap(cap); return 1; } /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { struct rb_node *q; /* touch this + preceding caps */ __touch_cap(cap); for (q = rb_first(&ci->i_caps); q != p; q = rb_next(q)) { cap = rb_entry(q, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if (cap->issued & mask) __touch_cap(cap); } } return 1; } } return 0; } int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); if (r) ceph_update_cap_hit(&fsc->mdsc->metric); else ceph_update_cap_mis(&fsc->mdsc->metric); return r; } /* * Return true if mask caps are currently being revoked by an MDS. */ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask) { struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } return 0; } int __ceph_caps_used(struct ceph_inode_info *ci) { int used = 0; if (ci->i_pin_ref) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || (S_ISREG(ci->netfs.inode.i_mode) && ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; if (ci->i_fx_ref) used |= CEPH_CAP_FILE_EXCL; return used; } #define FMODE_WAIT_BIAS 1000 /* * wanted, by virtue of open file modes */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ if (ci->i_nr_by_mode[RD_SHIFT] > 0 || time_after(ci->i_last_rd, used_cutoff)) want |= CEPH_CAP_ANY_SHARED; if (ci->i_nr_by_mode[WR_SHIFT] > 0 || time_after(ci->i_last_wr, used_cutoff)) { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) want |= CEPH_CAP_ANY_DIR_OPS; } if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) want |= CEPH_CAP_PIN; return want; } else { int bits = 0; if (ci->i_nr_by_mode[RD_SHIFT] > 0) { if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_rd, used_cutoff)) bits |= 1 << RD_SHIFT; } else if (time_after(ci->i_last_rd, idle_cutoff)) { bits |= 1 << RD_SHIFT; } if (ci->i_nr_by_mode[WR_SHIFT] > 0) { if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_wr, used_cutoff)) bits |= 1 << WR_SHIFT; } else if (time_after(ci->i_last_wr, idle_cutoff)) { bits |= 1 << WR_SHIFT; } /* check lazyio only when read/write is wanted */ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && ci->i_nr_by_mode[LAZY_SHIFT] > 0) bits |= 1 << LAZY_SHIFT; return bits ? ceph_caps_for_mode(bits >> 1) : 0; } } /* * wanted, by virtue of open file modes AND cap refs (buffered/cached data) */ int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; } else { /* we want EXCL if dirty data */ if (w & CEPH_CAP_FILE_BUFFER) w |= CEPH_CAP_FILE_EXCL; } return w; } /* * Return caps we have registered with the MDS(s) as 'wanted'. */ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; int mds_wanted = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (check && !__cap_is_valid(cap)) continue; if (cap == ci->i_auth_cap) mds_wanted |= cap->mds_wanted; else mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; } /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc; int removed = 0; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(cl, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode)); mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); if (ci->i_auth_cap == cap) ci->i_auth_cap = NULL; /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ doutc(cl, "delaying %p removal from session %p\n", cap, cap->session); } else { list_del_init(&cap->session_caps); session->s_nr_caps--; atomic64_dec(&mdsc->metric.total_caps); cap->session = NULL; removed = 1; } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; /* * s_cap_reconnect is protected by s_cap_lock. no one changes * s_cap_gen while session is in the reconnect state. */ if (queue_release && (!session->s_cap_reconnect || cap->cap_gen == atomic_read(&session->s_cap_gen))) { cap->queue_release = 1; if (removed) { __ceph_queue_cap_release(session, cap); removed = 0; } } else { cap->queue_release = 0; } cap->cap_ino = ci->i_vino.ino; spin_unlock(&session->s_cap_lock); if (removed) ceph_put_cap(mdsc, cap); if (!__ceph_is_any_real_caps(ci)) { /* when reconnect denied, we remove session caps forcibly, * i_wr_ref can be non-zero. If there are ongoing write, * keep i_snap_realm. */ if (ci->i_wr_ref == 0 && ci->i_snap_realm) ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } } void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, bool queue_release) { struct ceph_inode_info *ci = cap->ci; struct ceph_fs_client *fsc; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(mdsc->fsc->client, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); fsc = ceph_inode_to_fs_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } struct cap_msg_args { struct ceph_mds_session *session; u64 ino, cid, follows; u64 flush_tid, oldest_flush_tid, size, max_size; u64 xattr_version; u64 change_attr; struct ceph_buffer *xattr_buf; struct ceph_buffer *old_xattr_buf; struct timespec64 atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; u32 flags; kuid_t uid; kgid_t gid; umode_t mode; bool inline_data; bool wake; bool encrypted; u32 fscrypt_auth_len; u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context }; /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { struct ceph_mds_caps *fc; void *p; struct ceph_mds_client *mdsc = arg->session->s_mdsc; struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; doutc(mdsc->fsc->client, "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u" " tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), arg->cid, arg->ino, ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; memset(fc, 0, sizeof(*fc)); fc->cap_id = cpu_to_le64(arg->cid); fc->op = cpu_to_le32(arg->op); fc->seq = cpu_to_le32(arg->seq); fc->issue_seq = cpu_to_le32(arg->issue_seq); fc->migrate_seq = cpu_to_le32(arg->mseq); fc->caps = cpu_to_le32(arg->caps); fc->wanted = cpu_to_le32(arg->wanted); fc->dirty = cpu_to_le32(arg->dirty); fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (arg->encrypted) fc->size = cpu_to_le64(round_up(arg->size, CEPH_FSCRYPT_BLOCK_SIZE)); else #endif fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); ceph_encode_timespec64(&fc->ctime, &arg->ctime); fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); fc->mode = cpu_to_le32(arg->mode); fc->xattr_version = cpu_to_le64(arg->xattr_version); if (arg->xattr_buf) { msg->middle = ceph_buffer_get(arg->xattr_buf); fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); } p = fc + 1; /* flock buffer size (version 2) */ ceph_encode_32(&p, 0); /* inline version (version 4) */ ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); /* * osd_epoch_barrier (version 5) * The epoch_barrier is protected osdc->lock, so READ_ONCE here in * case it was recently changed */ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); /* * caller_uid/caller_gid (version 7) * * Currently, we don't properly track which caller dirtied the caps * last, and force a flush of them when there is a conflict. For now, * just set this to 0:0, to emulate how the MDS has worked up to now. */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); /* pool namespace (version 8) (mds always ignores this) */ ceph_encode_32(&p, 0); /* btime and change_attr (version 9) */ ceph_encode_timespec64(p, &arg->btime); p += sizeof(struct ceph_timespec); ceph_encode_64(&p, arg->change_attr); /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); /* dirstats (version 11) - these are r/o on the client */ ceph_encode_64(&p, 0); ceph_encode_64(&p, 0); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) /* * fscrypt_auth and fscrypt_file (version 12) * * fscrypt_auth holds the crypto context (if any). fscrypt_file * tracks the real i_size as an __le64 field (and we use a rounded-up * i_size in the traditional size field). */ ceph_encode_32(&p, arg->fscrypt_auth_len); ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); ceph_encode_32(&p, sizeof(__le64)); ceph_encode_64(&p, arg->size); #else /* CONFIG_FS_ENCRYPTION */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); #endif /* CONFIG_FS_ENCRYPTION */ } /* * Queue cap releases when an inode is dropped from our cache. */ void __ceph_remove_caps(struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct rb_node *p; /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) * may call __ceph_caps_issued_mask() on a freeing inode. */ spin_lock(&ci->i_ceph_lock); p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); ceph_remove_cap(mdsc, cap, true); } spin_unlock(&ci->i_ceph_lock); } /* * Prepare to send a cap message to an MDS. Update the cap state, and populate * the arg struct with the parameters that will need to be sent. This should * be done under the i_ceph_lock to guard against changes to cap state. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. */ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int op, int flags, int used, int want, int retain, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n", inode, ceph_vinop(inode), cap, cap->session, ceph_cap_string(held), ceph_cap_string(held & retain), ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ /* * Wake up any waiters on wanted -> needed transition. This is due to * the weird transition from buffered to sync IO... we need to flush * dirty pages _before_ allowing sync writes to avoid reordering. */ arg->wake = cap->implemented & ~cap->issued; cap->implemented &= cap->issued | used; cap->mds_wanted = want; arg->session = cap->session; arg->ino = ceph_vino(inode).ino; arg->cid = cap->cap_id; arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) { if (want & CEPH_CAP_ANY_FILE_WR) ci->i_requested_max_size = arg->max_size; else ci->i_requested_max_size = 0; } if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; } arg->mtime = inode_get_mtime(inode); arg->atime = inode_get_atime(inode); arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); arg->op = op; arg->caps = cap->implemented; arg->wanted = want; arg->dirty = flushing; arg->seq = cap->seq; arg->issue_seq = cap->issue_seq; arg->mseq = cap->mseq; arg->time_warp_seq = ci->i_time_warp_seq; arg->uid = inode->i_uid; arg->gid = inode->i_gid; arg->mode = inode->i_mode; arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && !list_empty(&ci->i_cap_snaps)) { struct ceph_cap_snap *capsnap; list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->cap_flush.tid) break; if (capsnap->need_flush) { flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; break; } } } arg->flags = flags; arg->encrypted = IS_ENCRYPTED(inode); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len && WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { /* Don't set this if it's too big */ arg->fscrypt_auth_len = 0; } else { arg->fscrypt_auth_len = ci->fscrypt_auth_len; memcpy(arg->fscrypt_auth, ci->fscrypt_auth, min_t(size_t, ci->fscrypt_auth_len, sizeof(arg->fscrypt_auth))); } #endif /* CONFIG_FS_ENCRYPTION */ } #if IS_ENABLED(CONFIG_FS_ENCRYPTION) #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; } #else #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS; } #endif /* CONFIG_FS_ENCRYPTION */ /* * Send a cap msg on the given inode. * * Caller should hold snap_rwsem (read), s_mutex. */ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, false); if (!msg) { pr_err_client(cl, "error allocating cap msg: ino (%llx.%llx)" " flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); return; } encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); ceph_buffer_put(arg->xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } static inline int __send_flush_snap(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap_snap *capsnap, u32 mseq, u64 oldest_flush_tid) { struct cap_msg_args arg; struct ceph_msg *msg; arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; arg.follows = capsnap->follows; arg.flush_tid = capsnap->cap_flush.tid; arg.oldest_flush_tid = oldest_flush_tid; arg.size = capsnap->size; arg.max_size = 0; arg.xattr_version = capsnap->xattr_version; arg.xattr_buf = capsnap->xattr_blob; arg.old_xattr_buf = NULL; arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; arg.ctime = capsnap->ctime; arg.btime = capsnap->btime; arg.change_attr = capsnap->change_attr; arg.op = CEPH_CAP_OP_FLUSHSNAP; arg.caps = capsnap->issued; arg.wanted = 0; arg.dirty = capsnap->dirty; arg.seq = 0; arg.issue_seq = 0; arg.mseq = mseq; arg.time_warp_seq = capsnap->time_warp_seq; arg.uid = capsnap->uid; arg.gid = capsnap->gid; arg.mode = capsnap->mode; arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; arg.encrypted = IS_ENCRYPTED(inode); /* No fscrypt_auth changes from a capsnap.*/ arg.fscrypt_auth_len = 0; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), GFP_NOFS, false); if (!msg) return -ENOMEM; encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); return 0; } /* * When a snapshot is taken, clients accumulate dirty metadata on * inodes with capabilities in ceph_cap_snaps to describe the file * state at the time the snapshot was taken. This must be flushed * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * * Called under i_ceph_lock. */ static void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session *session) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; u64 first_tid = 1, last_tid = 0; doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode), session); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { /* * we need to wait for sync writes to complete and for dirty * pages to be written out. */ if (capsnap->dirty_pages || capsnap->writing) break; /* should be removed by ceph_try_drop_cap_snap() */ BUG_ON(!capsnap->need_flush); /* only flush each capsnap once */ if (capsnap->cap_flush.tid > 0) { doutc(cl, "already flushed %p, skipping\n", capsnap); continue; } spin_lock(&mdsc->cap_dirty_lock); capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; list_add_tail(&capsnap->cap_flush.g_list, &mdsc->cap_flush_list); if (oldest_flush_tid == 0) oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&capsnap->cap_flush.i_list, &ci->i_cap_flush_list); if (first_tid == 1) first_tid = capsnap->cap_flush.tid; last_tid = capsnap->cap_flush.tid; } ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n", inode, ceph_vinop(inode), cap, session->s_mds); break; } ret = -ENOENT; list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { if (iter->tid >= first_tid) { cf = iter; ret = 0; break; } } if (ret < 0) break; first_tid = cf->tid + 1; capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap, " "ino (%llx.%llx) tid %llu follows %llu\n", ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); spin_lock(&ci->i_ceph_lock); } } void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_mds_session *session = NULL; bool need_put = false; int mds; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (psession) session = *psession; retry: spin_lock(&ci->i_ceph_lock); if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { doutc(cl, " no capsnap needs flush, doing nothing\n"); goto out; } if (!ci->i_auth_cap) { doutc(cl, " no auth cap (migrating?), doing nothing\n"); goto out; } mds = ci->i_auth_cap->session->s_mds; if (session && session->s_mds != mds) { doutc(cl, " oops, wrong session %p mutex\n", session); ceph_put_mds_session(session); session = NULL; } if (!session) { spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); goto retry; } // make sure flushsnap messages are sent in proper order. if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) *psession = session; else ceph_put_mds_session(session); /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); if (!list_empty(&ci->i_snap_flush_item)) need_put = true; list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); if (need_put) iput(inode); } /* * Mark caps dirty. If inode is newly dirty, return the dirty flags. * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int was = ci->i_dirty_caps; int dirty = 0; lockdep_assert_held(&ci->i_ceph_lock); if (!ci->i_auth_cap) { pr_warn_client(cl, "%p %llx.%llx mask %s, " "but no auth cap (session was closed?)\n", inode, ceph_vinop(inode), ceph_cap_string(mask)); return 0; } doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode, ceph_vinop(inode), ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { struct ceph_mds_session *session = ci->i_auth_cap->session; WARN_ON_ONCE(ci->i_prealloc_cap_flush); swap(ci->i_prealloc_cap_flush, *pcf); if (!ci->i_head_snapc) { WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n", inode, ceph_vinop(inode), ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); dirty |= I_DIRTY_SYNC; } } else { WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; __cap_delay_requeue(mdsc, ci); return dirty; } struct ceph_cap_flush *ceph_alloc_cap_flush(void) { struct ceph_cap_flush *cf; cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); if (!cf) return NULL; cf->is_capsnap = false; return cf; } void ceph_free_cap_flush(struct ceph_cap_flush *cf) { if (cf) kmem_cache_free(ceph_cap_flush_cachep, cf); } static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) { if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_first_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); return cf->tid; } return 0; } /* * Remove cap_flush from the mdsc's or inode's flushing cap list. * Return true if caller needs to wake up flush waiters. */ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { prev = list_prev_entry(cf, g_list); prev->wake = true; wake = false; } list_del_init(&cf->g_list); return wake; } static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { prev = list_prev_entry(cf, i_list); prev->wake = true; wake = false; } list_del_init(&cf->i_list); return wake; } /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * * Called under i_ceph_lock. Returns the flush tid. */ static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap_flush *cf = NULL; int flushing; lockdep_assert_held(&ci->i_ceph_lock); BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; doutc(cl, "flushing %s, flushing_caps %s -> %s\n", ceph_cap_string(flushing), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps | flushing)); ci->i_flushing_caps |= flushing; ci->i_dirty_caps = 0; doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; cf->wake = wake; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); cf->tid = ++mdsc->last_cap_flush_tid; list_add_tail(&cf->g_list, &mdsc->cap_flush_list); *oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&cf->i_list, &ci->i_cap_flush_list); return cf->tid; } /* * try to invalidate mapping pages without blocking. */ static int try_nonblocking_invalidate(struct inode *inode) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ doutc(cl, "%p %llx.%llx success\n", inode, ceph_vinop(inode)); /* save any racing async invalidate some trouble */ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode)); return -1; } bool __ceph_should_report_size(struct ceph_inode_info *ci) { loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; if (size >= ci->i_max_size) return true; /* half of previous max_size increment has been used */ if (ci->i_max_size > ci->i_reported_size && (size << 1) >= ci->i_max_size + ci->i_reported_size) return true; return false; } /* * Swiss army knife function to examine currently used and wanted * versus held caps. Release, flush, ack revoked caps to mds as * appropriate. * * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without * further delay. */ void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); return; } if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); /* Caps which have active references against them */ used = __ceph_caps_used(ci); /* * "issued" represents the current caps that the MDS wants us to have. * "implemented" is the set that we have been granted, and includes the * ones that have not yet been returned to the MDS (the "revoking" set, * usually because they have outstanding references). */ issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; want = file_wanted; /* The ones we currently want to retain (may be adjusted below) */ retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && __ceph_dir_is_complete(ci)) { /* * If a directory is complete, we want to keep * the exclusive cap. So that MDS does not end up * revoking the shared cap on every create/unlink * operation. */ if (IS_RDONLY(inode)) { want = CEPH_CAP_ANY_SHARED; } else { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; } retain |= want; } else { retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, * because then the mds would revoke it anyway to * journal max_size=0. */ if (ci->i_max_size == 0) retain |= CEPH_CAP_ANY_RD; } } doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " "flushing %s issued %s revoking %s retain %s %s%s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "", (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : ""); /* * If we no longer need to hold onto old our caps, and we may * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && S_ISREG(inode->i_mode) && !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { doutc(cl, "trying to invalidate on %p %llx.%llx\n", inode, ceph_vinop(inode)); if (try_nonblocking_invalidate(inode) < 0) { doutc(cl, "queuing invalidate\n"); queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } tried_invalidate = true; goto retry; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { int mflags = 0; struct cap_msg_args arg; cap = rb_entry(p, struct ceph_cap, ci_node); /* avoid looping forever */ if (mds >= cap->mds || ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) continue; /* * If we have an auth cap, we don't need to consider any * overlapping caps as used. */ cap_used = used; if (ci->i_auth_cap && cap != ci->i_auth_cap) cap_used &= ~ci->i_auth_cap->issued; revoking = cap->implemented & ~cap->issued; doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap_used), ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); /* completed revocation? going down and there are no caps? */ if (revoking) { if ((revoking & cap_used) == 0) { doutc(cl, "completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; } /* * If the "i_wrbuffer_ref" was increased by mmap or generic * cache write just before the ceph_check_caps() is called, * the Fb capability revoking will fail this time. Then we * must wait for the BDI's delayed work to flush the dirty * pages and to release the "i_wrbuffer_ref", which will cost * at most 5 seconds. That means the MDS needs to wait at * most 5 seconds to finished the Fb capability's revocation. * * Let's queue a writeback for it. */ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && (revoking & CEPH_CAP_FILE_BUFFER)) queue_writeback = true; } if (flags & CHECK_CAPS_FLUSH_FORCE) { doutc(cl, "force to flush caps\n"); goto ack; } if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ if (ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) { doutc(cl, "requesting new max_size\n"); goto ack; } /* approaching file_max? */ if (__ceph_should_report_size(ci)) { doutc(cl, "i_size approaching max_size\n"); goto ack; } } /* flush anything dirty? */ if (cap == ci->i_auth_cap) { if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { doutc(cl, "flushing dirty caps\n"); goto ack; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { doutc(cl, "flushing snap caps\n"); goto ack; } } /* want more caps from mds? */ if (want & ~cap->mds_wanted) { if (want & ~(cap->mds_wanted | cap->issued)) goto ack; if (!__cap_is_valid(cap)) goto ack; } /* things we might delay */ if ((cap->issued & ~retain) == 0) continue; /* nope, all good */ ack: ceph_put_mds_session(session); session = ceph_get_mds_session(cap->session); /* kick flushing and flush snaps before sending normal * cap message */ if (cap == ci->i_auth_cap && (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry; } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, false, &oldest_flush_tid); if (flags & CHECK_CAPS_FLUSH && list_empty(&session->s_cap_dirty)) mflags |= CEPH_CLIENT_CAPS_SYNC; } else { flushing = 0; flush_tid = 0; spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); } mds = cap->mds; /* remember mds, so we don't repeat */ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); spin_lock(&ci->i_ceph_lock); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* periodically re-calculate caps wanted by open files */ if (__ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list) && (file_wanted & ~CEPH_CAP_PIN) && !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { __cap_delay_requeue(mdsc, ci); } spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); if (queue_writeback) ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } /* * Try to flush dirty caps back to the auth mds. */ static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing = 0; u64 flush_tid = 0, oldest_flush_tid = 0; spin_lock(&ci->i_ceph_lock); retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; struct cap_msg_args arg; struct ceph_mds_session *session = cap->session; if (session->s_state < CEPH_MDS_SESSION_OPEN) { spin_unlock(&ci->i_ceph_lock); goto out; } if (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry_locked; } flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, true, &oldest_flush_tid); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = list_last_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); cf->wake = true; flush_tid = cf->tid; } flushing = ci->i_flushing_caps; spin_unlock(&ci->i_ceph_lock); } out: *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. */ static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); int ret = 1; spin_lock(&ci->i_ceph_lock); if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush * cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); if (cf->tid <= flush_tid) ret = 0; } spin_unlock(&ci->i_ceph_lock); return ret; } /* * flush the mdlog and wait for any unsafe requests to complete. */ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { req1 = list_last_entry(&ci->i_unsafe_dirops, struct ceph_mds_request, r_unsafe_dir_item); ceph_mdsc_get_request(req1); } if (!list_empty(&ci->i_unsafe_iops)) { req2 = list_last_entry(&ci->i_unsafe_iops, struct ceph_mds_request, r_unsafe_target_item); ceph_mdsc_get_request(req2); } spin_unlock(&ci->i_unsafe_lock); /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. */ if (req1 || req2) { struct ceph_mds_request *req; struct ceph_mds_session **sessions; struct ceph_mds_session *s; unsigned int max_sessions; int i; mutex_lock(&mdsc->mutex); max_sessions = mdsc->max_sessions; sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { mutex_unlock(&mdsc->mutex); err = -ENOMEM; goto out; } spin_lock(&ci->i_unsafe_lock); if (req1) { list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } if (req2) { list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } spin_unlock(&ci->i_unsafe_lock); /* the auth MDS */ spin_lock(&ci->i_ceph_lock); if (ci->i_auth_cap) { s = ci->i_auth_cap->session; if (!sessions[s->s_mds]) sessions[s->s_mds] = ceph_get_mds_session(s); } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDSes */ for (i = 0; i < max_sessions; i++) { s = sessions[i]; if (s) { send_flush_mdlog(s); ceph_put_mds_session(s); } } kfree(sessions); } doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode, ceph_vinop(inode), req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, ceph_timeout_jiffies(req1->r_timeout)); if (ret) err = -EIO; } if (req2) { ret = !wait_for_completion_timeout(&req2->r_safe_completion, ceph_timeout_jiffies(req2->r_timeout)); if (ret) err = -EIO; } out: if (req1) ceph_mdsc_put_request(req1); if (req2) ceph_mdsc_put_request(req2); return err; } int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int ret, err; int dirty; doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode), datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); if (datasync) goto out; ret = ceph_wait_on_async_create(inode); if (ret) goto out; dirty = try_flush_caps(inode, &flush_tid); doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty)); err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } if (err < 0) ret = err; err = file_check_and_advance_wb_err(file); if (err < 0) ret = err; out: doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode), datasync ? " datasync" : "", ret); return ret; } /* * Flush any dirty caps back to the mds. If we aren't asked to wait, * queue inode for flush but don't do so immediately, because we can * get by with fewer MDS messages if we wait for data writeback to * complete first. */ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int err = 0; int dirty; int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { err = ceph_wait_on_async_create(inode); if (err) return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } return err; } static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; u64 first_tid = 0; u64 last_snap_flush = 0; /* Don't do anything until create reply comes in */ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) return; ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { if (cf->is_capsnap) { last_snap_flush = cf->tid; break; } } list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { if (cf->tid < first_tid) continue; cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); break; } first_tid = cf->tid + 1; if (!cf->is_capsnap) { struct cap_msg_args arg; doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n", inode, ceph_vinop(inode), cap, cf->tid, ceph_cap_string(cf->caps)); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap," " %p %llx.%llx tid %llu follows %llu\n", inode, ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); } spin_lock(&ci->i_ceph_lock); } } void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } /* * if flushing caps were revoked, we re-send the cap flush * in client reconnect stage. This guarantees MDS * processes * the cap flush message before issuing the flushing caps to * other client. */ if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { /* encode_caps_cb() also will reset these sequence * numbers. make sure sequence numbers in cap flush * message match later reconnect message */ cap->seq = 0; cap->issue_seq = 0; cap->mseq = 0; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } else { ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; lockdep_assert_held(&session->s_mutex); doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap *cap = ci->i_auth_cap; struct inode *inode = &ci->netfs.inode; lockdep_assert_held(&ci->i_ceph_lock); doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n", inode, ceph_vinop(inode), ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { u64 oldest_flush_tid; spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } } /* * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. */ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, bool snap_rwsem_locked) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); lockdep_assert_held(&ci->i_ceph_lock); if (got & CEPH_CAP_PIN) ci->i_pin_ref++; if (got & CEPH_CAP_FILE_RD) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; if (got & CEPH_CAP_FILE_EXCL) ci->i_fx_ref++; if (got & CEPH_CAP_FILE_WR) { if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { BUG_ON(!snap_rwsem_locked); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } ci->i_wr_ref++; } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(inode); ci->i_wb_ref++; doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref); } } /* * Try to grab cap references. Specify those refs we @want, and the * minimal set we @need. Also include the larger offset we are writing * to (when applicable), and check against max_size here as well. * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. * * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, * or a negative error code. There are 3 special error codes: * -EAGAIN: need to sleep but non-blocking is specified * -EFBIG: ask caller to call check_max_size() and try again. * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. */ enum { /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ NON_BLOCKING = (1 << 8), CHECK_FILELOCK = (1 << 9), }; static int try_get_cap_refs(struct inode *inode, int need, int want, loff_t endoff, int flags, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); int ret = 0; int have, implemented; bool snap_rwsem_locked = false; doutc(cl, "%p %llx.%llx need %s want %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(want)); again: spin_lock(&ci->i_ceph_lock); if ((flags & CHECK_FILELOCK) && (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { doutc(cl, "%p %llx.%llx error filelock\n", inode, ceph_vinop(inode)); ret = -EIO; goto out_unlock; } /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) { up_read(&mdsc->snap_rwsem); snap_rwsem_locked = false; } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } have = __ceph_caps_issued(ci, &implemented); if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n", inode, ceph_vinop(inode), endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN; goto out_unlock; } /* * If a sync write is in progress, we must wait, so that we * can get a final snapshot value for size+mtime. */ if (__ceph_have_pending_cap_snap(ci)) { doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode, ceph_vinop(inode)); goto out_unlock; } } if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. * * For RDCACHE|RD -> RD, there is not need to wait and we can * just exclude the revoking caps and force to sync read. */ int not = want & ~(have & need); int revoking = implemented & ~have; int exclude = revoking & not; doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { if (!down_read_trylock(&mdsc->snap_rwsem)) { /* * we can not call down_read() when * task isn't in TASK_RUNNING state */ if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); snap_rwsem_locked = true; goto again; } snap_rwsem_locked = true; } if ((have & want) == want) *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); ret = 1; } } else { int session_readonly = false; int mds_wanted; if (ci->i_auth_cap && (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { struct ceph_mds_session *s = ci->i_auth_cap->session; spin_lock(&s->s_cap_lock); session_readonly = s->s_readonly; spin_unlock(&s->s_cap_lock); } if (session_readonly) { doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n", inode, ceph_vinop(inode), ceph_cap_string(need), ci->i_auth_cap->mds); ret = -EROFS; goto out_unlock; } if (ceph_inode_is_shutdown(inode)) { doutc(cl, "%p %llx.%llx inode is shutdown\n", inode, ceph_vinop(inode)); ret = -ESTALE; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); if (need & ~mds_wanted) { doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(mds_wanted)); ret = -EUCLEAN; goto out_unlock; } doutc(cl, "%p %llx.%llx have %s need %s\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: __ceph_touch_fmode(ci, mdsc, flags); spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); if (!ret) ceph_update_cap_mis(&mdsc->metric); else if (ret == 1) ceph_update_cap_hit(&mdsc->metric); doutc(cl, "%p %llx.%llx ret %d got %s\n", inode, ceph_vinop(inode), ret, ceph_cap_string(*got)); return ret; } /* * Check the offset we are writing up to against our current * max_size. If necessary, tell the MDS we want to write to * a larger offset. */ static void check_max_size(struct inode *inode, loff_t endoff) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int check = 0; /* do we need to explicitly request a larger max_size? */ spin_lock(&ci->i_ceph_lock); if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n", inode, ceph_vinop(inode), endoff); ci->i_wanted_max_size = endoff; } /* duplicate ceph_check_caps()'s logic */ if (ci->i_auth_cap && (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) { int fmode = 0; if (caps & CEPH_CAP_FILE_RD) fmode |= CEPH_FILE_MODE_RD; if (caps & CEPH_CAP_FILE_WR) fmode |= CEPH_FILE_MODE_WR; return fmode; } int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret, flags; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_ANY_DIR_OPS)); if (need) { ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; } flags = get_used_fmode(need | want); if (nonblock) flags |= NON_BLOCKING; ret = try_get_cap_refs(inode, need, want, 0, flags, got); /* three special error codes */ if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN) ret = 0; return ret; } /* * Wait for caps, and take cap references. If we can't get a WR cap * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, int want, loff_t endoff, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; flags = get_used_fmode(need | want); while (true) { flags &= CEPH_FILE_MODE_MASK; if (vfs_inode_has_locks(inode)) flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got); WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { #ifdef CONFIG_DEBUG_FS struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; #endif DEFINE_WAIT_FUNC(wait, woken_wake_function); #ifdef CONFIG_DEBUG_FS cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; cw.want = want; spin_lock(&mdsc->caps_list_lock); list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); #endif /* make sure used fmode not timeout */ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; while (!(ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&ci->i_cap_wq, &wait); ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); #ifdef CONFIG_DEBUG_FS spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); spin_unlock(&mdsc->caps_list_lock); #endif if (ret == -EAGAIN) continue; } if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); return -EBADF; } if (ret < 0) { if (ret == -EFBIG || ret == -EUCLEAN) { int ret2 = ceph_wait_on_async_create(inode); if (ret2 < 0) return ret2; } if (ret == -EFBIG) { check_max_size(inode, endoff); continue; } if (ret == -EUCLEAN) { /* session was killed, try renew caps */ ret = ceph_renew_caps(inode, flags); if (ret == 0) continue; } return ret; } if (S_ISREG(ci->netfs.inode.i_mode) && ceph_has_inline_data(ci) && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = find_get_page(inode->i_mapping, 0); if (page) { bool uptodate = PageUptodate(page); put_page(page); if (uptodate) break; } /* * drop cap refs first because getattr while * holding * caps refs can cause deadlock. */ ceph_put_cap_refs(ci, _got); _got = 0; /* * getattr request will bring inline data into * page cache */ ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) return ret; continue; } break; } *got = _got; return 0; } int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) { struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); return __ceph_get_caps(inode, fi, need, want, endoff, got); } /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); ceph_take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } /* * drop cap_snap that is not associated with any snapshot. * we don't need to send FLUSHSNAP message for it. */ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); if (!capsnap->need_flush && !capsnap->writing && !capsnap->dirty_pages) { doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows); BUG_ON(capsnap->cap_flush.tid > 0); ceph_put_snap_context(capsnap->context); if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; list_del(&capsnap->ci_item); ceph_put_cap_snap(capsnap); return 1; } return 0; } enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, PUT_CAP_REFS_ASYNC, }; /* * Release cap refs. * * If we released the last ref on any given cap, call ceph_check_caps * to release (or schedule a release). * * If we are releasing a WR cap (from a sync write), finalize any affected * cap_snap, and wake up any waiters. */ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) if (--ci->i_rd_ref == 0) last++; if (had & CEPH_CAP_FILE_CACHE) if (--ci->i_rdcache_ref == 0) last++; if (had & CEPH_CAP_FILE_EXCL) if (--ci->i_fx_ref == 0) last++; if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; /* put the ref held by ceph_take_cap_refs() */ put++; check_flushsnaps = true; } doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { /* * The Fb caps will always be took and released * together with the Fw caps. */ WARN_ON_ONCE(ci->i_wb_ref); last++; check_flushsnaps = true; if (ci->i_wrbuffer_ref_head == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) ceph_change_snap_realm(inode, NULL); } } if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); capsnap->writing = 0; if (ceph_try_drop_cap_snap(ci, capsnap)) /* put the ref held by ceph_queue_cap_snap() */ put++; else if (__ceph_finish_cap_snap(ci, capsnap)) flushsnaps = 1; wake = 1; } spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); switch (mode) { case PUT_CAP_REFS_SYNC: if (last) ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; case PUT_CAP_REFS_ASYNC: if (last) ceph_queue_check_caps(inode); else if (flushsnaps) ceph_queue_flush_snaps(inode); break; default: break; } if (wake) wake_up_all(&ci->i_cap_wq); while (put-- > 0) iput(inode); } void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC); } void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. * Once all dirty data for a cap_snap is flushed, flush snapped file * metadata back to the MDS. If we dropped the last ref, call * ceph_check_caps. */ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; bool flush_snaps = false; bool complete_capsnap = false; spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; if (ci->i_wrbuffer_ref == 0) { last = true; put++; } if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n", inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->context == snapc) { capsnap = iter; break; } } if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. */ WARN_ON_ONCE(ci->i_auth_cap); goto unlock; } capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = true; if (!capsnap->writing) { if (ceph_try_drop_cap_snap(ci, capsnap)) { put++; } else { ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; flush_snaps = true; } } } doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n", inode, ceph_vinop(inode), capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", complete_capsnap ? " (complete capsnap)" : ""); } unlock: spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } if (complete_capsnap) wake_up_all(&ci->i_cap_wq); while (put-- > 0) { iput(inode); } } /* * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. */ static void invalidate_aliases(struct inode *inode) { struct ceph_client *cl = ceph_inode_to_client(inode); struct dentry *dn, *prev = NULL; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns * hashed dentry. After calling d_invalidate(), the * dentry becomes unhashed. * * For directory inode, d_find_alias() can return * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { if (dn == prev) { dput(dn); break; } d_invalidate(dn); if (prev) dput(prev); prev = dn; } if (prev) dput(prev); } struct cap_extra_info { struct ceph_string *pool_ns; /* inline data */ u64 inline_version; void *inline_data; u32 inline_len; /* dirstat */ bool dirstat_valid; u64 nfiles; u64 nsubdirs; u64 change_attr; /* currently issued */ int issued; struct timespec64 btime; u8 *fscrypt_auth; u32 fscrypt_auth_len; u64 fscrypt_file_size; }; /* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. */ static void handle_cap_grant(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_mds_caps *grant, struct ceph_buffer *xattr_buf, struct cap_extra_info *extra_info) __releases(ci->i_ceph_lock) __releases(session->s_mdsc->snap_rwsem) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); unsigned char check_caps = 0; bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); bool wake = false; bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; bool deleted_inode = false; bool fill_inline = false; bool revoke_wait = false; int flags = 0; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode, ceph_vinop(inode), cap, session->s_mds, seq, ceph_cap_string(newcaps)); doutc(cl, " size %llu max_size %llu, i_size %llu\n", size, max_size, i_size_read(inode)); /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } } } if (was_stale) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export message, * but still haven't received the cap import message. handle_cap_export * updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message * that was sent before the cap import message. So don't remove caps. */ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); seq = cap->seq; newcaps |= cap->issued; } /* side effects now are allowed */ cap->cap_gen = atomic_read(&session->s_cap_gen); cap->seq = seq; __check_cap_issue(ci, cap, newcaps); inode_set_max_iversion_raw(inode, extra_info->change_attr); if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { umode_t mode = le32_to_cpu(grant->mode); if (inode_wrong_type(inode, mode)) pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", ceph_vinop(inode), inode->i_mode, mode); else inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, ceph_vinop(inode), inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, ci->fscrypt_auth_len)) pr_warn_ratelimited_client(cl, "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", ci->fscrypt_auth_len, extra_info->fscrypt_auth_len); #endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0) deleted_inode = true; } if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); u64 version = le64_to_cpu(grant->xattr_version); if (version > ci->i_xattrs.version) { doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n", version, inode, ceph_vinop(inode), len); if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; ceph_forget_all_cached_acls(inode); ceph_security_invalidate_secctx(inode); } } if (newcaps & CEPH_CAP_ANY_RD) { struct timespec64 mtime, atime, ctime; /* ctime/mtime/atime? */ ceph_decode_timespec64(&mtime, &grant->mtime); ceph_decode_timespec64(&atime, &grant->atime); ceph_decode_timespec64(&ctime, &grant->ctime); ceph_fill_file_time(inode, extra_info->issued, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); } if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { ci->i_files = extra_info->nfiles; ci->i_subdirs = extra_info->nsubdirs; } if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, lockdep_is_held(&ci->i_ceph_lock)); rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); if (ci->i_layout.pool_id != old_pool || extra_info->pool_ns != old_ns) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; extra_info->pool_ns = old_ns; /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, extra_info->issued, le32_to_cpu(grant->truncate_seq), le64_to_cpu(grant->truncate_size), size); } if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { if (max_size != ci->i_max_size) { doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size, max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; } wake = true; } } /* check cap bits */ wanted = __ceph_caps_wanted(ci); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, " my wanted = %s, used = %s, dirty %s\n", ceph_cap_string(wanted), ceph_cap_string(used), ceph_cap_string(dirty)); if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && (wanted & ~(cap->mds_wanted | newcaps))) { /* * If mds is importing cap, prior cap messages that update * 'wanted' may get dropped by mds (migrate seq mismatch). * * We don't send cap message to update 'wanted' if what we * want are already issued. If mds revokes caps, cap message * that releases caps also tells mds what we want. But if * caps got revoked by mds forcedly (session stale). We may * haven't told mds what we want. */ check_caps = 1; } /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; doutc(cl, "revocation: %s -> %s (revoking %s)\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); if (S_ISREG(inode->i_mode) && (revoking & used & CEPH_CAP_FILE_BUFFER)) { writeback = true; /* initiate writeback; will delay ack */ revoke_wait = true; } else if (queue_invalidate && revoking == CEPH_CAP_FILE_CACHE && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) { revoke_wait = true; /* do nothing yet, invalidation will be queued */ } else if (cap == ci->i_auth_cap) { check_caps = 1; /* check auth cap only */ } else { check_caps = 2; /* check all caps */ } /* If there is new caps, try to wake up the waiters */ if (~cap->issued & newcaps) wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { doutc(cl, "caps unchanged: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); } else { doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); /* non-auth MDS is revoking the newly grant caps ? */ if (cap == ci->i_auth_cap && __ceph_caps_revoking_other(ci, cap, newcaps)) check_caps = 2; cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a * pending revocation */ wake = true; } BUG_ON(cap->issued & ~cap->implemented); /* don't let check_caps skip sending a response to MDS for revoke msgs */ if (!revoke_wait && le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { cap->mds_wanted = 0; flags |= CHECK_CAPS_FLUSH_FORCE; if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ } if (extra_info->inline_version > 0 && extra_info->inline_version >= ci->i_inline_version) { ci->i_inline_version = extra_info->inline_version; if (ci->i_inline_version != CEPH_INLINE_NONE && (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) fill_inline = true; } if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (ci->i_auth_cap == cap) { if (newcaps & ~extra_info->issued) wake = true; if (ci->i_requested_max_size > max_size || !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { /* re-request max_size if necessary */ ci->i_requested_max_size = 0; wake = true; } ceph_kick_flushing_inode_caps(session, ci); } up_read(&session->s_mdsc->snap_rwsem); } spin_unlock(&ci->i_ceph_lock); if (fill_inline) ceph_fill_inline_data(inode, NULL, extra_info->inline_data, extra_info->inline_len); if (queue_trunc) ceph_queue_vmtruncate(inode); if (writeback) /* * queue inode for writeback: we can't actually call * filemap_write_and_wait, etc. from message handler * context. */ ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); if (deleted_inode) invalidate_aliases(inode); if (wake) wake_up_all(&ci->i_cap_wq); mutex_unlock(&session->s_mutex); if (check_caps == 1) ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL); } /* * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the * MDS has been safely committed. */ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_flush *cf, *tmp_cf; LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; bool wake_ci = false; bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed? */ if (cf->tid == flush_tid) cleaned = cf->caps; /* Is this a capsnap? */ if (cf->is_capsnap) continue; if (cf->tid <= flush_tid) { /* * An earlier or current tid. The FLUSH_ACK should * represent a superset of this flush's caps. */ wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { /* * This is a later one. Any caps in it are still dirty * so don't count them as cleaned. */ cleaned &= ~cf->caps; if (!cleaned) break; } } doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n", inode, ceph_vinop(inode), session->s_mds, seq, ceph_cap_string(dirty), ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); list_for_each_entry(cf, &to_remove, i_list) wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); if (ci->i_flushing_caps == 0) { if (list_empty(&ci->i_cap_flush_list)) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) { struct inode *inode = &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, i_flushing_item)->netfs.inode; doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n", session->s_mds, inode, ceph_vinop(inode)); } } mdsc->num_cap_flushing--; doutc(cl, " %p %llx.%llx now !flushing\n", inode, ceph_vinop(inode)); if (ci->i_dirty_caps == 0) { doutc(cl, " %p %llx.%llx now clean\n", inode, ceph_vinop(inode)); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = true; if (ci->i_wr_ref == 0 && ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } } spin_unlock(&mdsc->cap_dirty_lock); out: spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); if (drop) iput(inode); } void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; bool ret; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap, inode, ceph_vinop(inode), ci); list_del_init(&capsnap->ci_item); ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); if (wake_ci) *wake_ci = ret; spin_lock(&mdsc->cap_dirty_lock); if (list_empty(&ci->i_cap_flush_list)) list_del_init(&ci->i_flushing_item); ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); if (wake_mdsc) *wake_mdsc = ret; spin_unlock(&mdsc->cap_dirty_lock); } void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); lockdep_assert_held(&ci->i_ceph_lock); WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); } /* * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can * throw away our cap_snap. * * Caller hold s_mutex. */ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode, ceph_vinop(inode), ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->follows == follows) { if (iter->cap_flush.tid != flush_tid) { doutc(cl, " cap_snap %p follows %lld " "tid %lld != %lld\n", iter, follows, flush_tid, iter->cap_flush.tid); break; } capsnap = iter; break; } else { doutc(cl, " skipping cap_snap %p follows %lld\n", iter, iter->follows); } } if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); iput(inode); } } /* * Handle TRUNC from MDS, indicating file truncation. * * caller hold s_mutex. */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session, struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int mds = session->s_mds; int seq = le32_to_cpu(trunc->seq); u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); u64 truncate_size = le64_to_cpu(trunc->truncate_size); u64 size = le64_to_cpu(trunc->size); int implemented = 0; int dirty = __ceph_caps_dirty(ci); int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); bool queue_trunc = false; lockdep_assert_held(&ci->i_ceph_lock); issued |= implemented | dirty; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n", inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; } /* * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a * different one. If we are the most recent migration we've seen (as * indicated by mseq), make note of the migrating cap bits for the * duration (until we see the corresponding IMPORT). * * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *tsession = NULL; struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); u64 t_cap_id; u32 t_issue_seq, t_mseq; int target, issued; int mds = session->s_mds; if (ph) { t_cap_id = le64_to_cpu(ph->cap_id); t_issue_seq = le32_to_cpu(ph->issue_seq); t_mseq = le32_to_cpu(ph->mseq); target = le32_to_cpu(ph->mds); } else { t_cap_id = t_issue_seq = t_mseq = 0; target = -1; } doutc(cl, " cap %llx.%llx export to peer %d piseq %u pmseq %u\n", ceph_vinop(inode), target, t_issue_seq, t_mseq); retry: down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) goto out_unlock; if (target < 0) { ceph_remove_cap(mdsc, cap, false); goto out_unlock; } /* * now we know we haven't received the cap import message yet * because the exported cap still exist. */ issued = cap->issued; if (issued != cap->implemented) pr_err_ratelimited_client(cl, "issued != implemented: " "%p %llx.%llx mds%d seq %d mseq %d" " issued %s implemented %s\n", inode, ceph_vinop(inode), mds, cap->seq, cap->mseq, ceph_cap_string(issued), ceph_cap_string(cap->implemented)); tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) { doutc(cl, " updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; tcap->seq = t_issue_seq - 1; tcap->issue_seq = t_issue_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) { ci->i_auth_cap = tcap; change_auth_cap_ses(ci, tcap->session); } } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } else if (tsession) { /* add placeholder for the export target */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, issued, 0, t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &tcap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ tsession = ceph_mdsc_open_export_target_session(mdsc, target); if (!IS_ERR(tsession)) { if (mds > target) { mutex_lock(&session->s_mutex); mutex_lock_nested(&tsession->s_mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&tsession->s_mutex); mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); tsession = NULL; target = -1; mutex_lock(&session->s_mutex); } goto retry; out_unlock: spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); ceph_put_mds_session(tsession); } if (new_cap) ceph_put_cap(mdsc, new_cap); } /* * Handle cap IMPORT. * * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, struct ceph_cap **target_cap, int *old_issued) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; int issued; unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); u64 p_cap_id; u32 piseq = 0; u32 pmseq = 0; int peer; if (ph) { p_cap_id = le64_to_cpu(ph->cap_id); peer = le32_to_cpu(ph->mds); piseq = le32_to_cpu(ph->issue_seq); pmseq = le32_to_cpu(ph->mseq); } else { p_cap_id = 0; peer = -1; } doutc(cl, " cap %llx.%llx import from peer %d piseq %u pmseq %u\n", ceph_vinop(inode), peer, piseq, pmseq); retry: cap = __get_cap_for_mds(ci, mds); if (!cap) { if (!new_cap) { spin_unlock(&ci->i_ceph_lock); new_cap = ceph_get_cap(mdsc, NULL); spin_lock(&ci->i_ceph_lock); goto retry; } cap = new_cap; } else { if (new_cap) { ceph_put_cap(mdsc, new_cap); new_cap = NULL; } } __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, &new_cap); ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; if (ocap && ocap->cap_id == p_cap_id) { doutc(cl, " remove export cap %p mds%d flags %d\n", ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && (ocap->seq != piseq || ocap->mseq != pmseq)) { pr_err_ratelimited_client(cl, "mismatched seq/mseq: " "%p %llx.%llx mds%d seq %d mseq %d" " importer mds%d has peer seq %d mseq %d\n", inode, ceph_vinop(inode), peer, ocap->seq, ocap->mseq, mds, piseq, pmseq); } ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } *old_issued = issued; *target_cap = cap; } #ifdef CONFIG_FS_ENCRYPTION static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); if (extra->fscrypt_auth_len) { ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, GFP_KERNEL); if (!extra->fscrypt_auth) return -ENOMEM; ceph_decode_copy_safe(p, end, extra->fscrypt_auth, extra->fscrypt_auth_len, bad); } ceph_decode_32_safe(p, end, len, bad); if (len >= sizeof(u64)) { ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); len -= sizeof(u64); } ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #else static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; /* Don't care about these fields unless we're encryption-capable */ ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #endif /* * Handle a caps message from the MDS. * * Identify the appropriate session, inode, and call the right handler * based on the cap op. */ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; struct ceph_snap_realm *realm = NULL; int op; int msg_version = le16_to_cpu(msg->hdr.version); u32 seq, mseq, issue_seq; struct ceph_vino vino; void *snaptrace; size_t snaptrace_len; void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; bool close_sessions = false; bool do_cap_release = false; if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); issue_seq = le32_to_cpu(h->issue_seq); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); p = snaptrace + snaptrace_len; if (msg_version >= 2) { u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) goto bad; p += flock_len; } if (msg_version >= 3) { if (op == CEPH_CAP_OP_IMPORT) { if (p + sizeof(*peer) > end) goto bad; peer = p; p += sizeof(*peer); } else if (op == CEPH_CAP_OP_EXPORT) { /* recorded in unused fields */ peer = (void *)&h->size; } } if (msg_version >= 4) { ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); if (p + extra_info.inline_len > end) goto bad; extra_info.inline_data = p; p += extra_info.inline_len; } if (msg_version >= 5) { struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; u32 epoch_barrier; ceph_decode_32_safe(&p, end, epoch_barrier, bad); ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); } if (msg_version >= 8) { u32 pool_ns_len; /* version >= 6 */ ceph_decode_skip_64(&p, end, bad); // flush_tid /* version >= 7 */ ceph_decode_skip_32(&p, end, bad); // caller_uid ceph_decode_skip_32(&p, end, bad); // caller_gid /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); if (pool_ns_len > 0) { ceph_decode_need(&p, end, pool_ns_len, bad); extra_info.pool_ns = ceph_find_or_create_string(p, pool_ns_len); p += pool_ns_len; } } if (msg_version >= 9) { struct ceph_timespec *btime; if (p + sizeof(*btime) > end) goto bad; btime = p; ceph_decode_timespec64(&extra_info.btime, btime); p += sizeof(*btime); ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); } if (msg_version >= 11) { /* version >= 10 */ ceph_decode_skip_32(&p, end, bad); // flags /* version >= 11 */ extra_info.dirstat_valid = true; ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } if (msg_version >= 12) { if (parse_fscrypt_fields(&p, end, &extra_info)) goto bad; } /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, seq, issue_seq, mseq); mutex_lock(&session->s_mutex); if (!inode) { doutc(cl, " i don't have ino %llx\n", vino.ino); switch (op) { case CEPH_CAP_OP_IMPORT: case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { case CEPH_CAP_OP_FLUSHSNAP_ACK: handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), h, session); goto done; case CEPH_CAP_OP_EXPORT: handle_cap_export(inode, h, peer, session); goto done_unlocked; case CEPH_CAP_OP_IMPORT: realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); if (ceph_update_snap_trace(mdsc, snaptrace, snaptrace + snaptrace_len, false, &realm)) { up_write(&mdsc->snap_rwsem); close_sessions = true; goto done; } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); } spin_lock(&ci->i_ceph_lock); handle_cap_import(mdsc, inode, h, peer, session, &cap, &extra_info.issued); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; } /* the rest require a cap */ spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), session->s_mds); spin_unlock(&ci->i_ceph_lock); switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &extra_info.issued); extra_info.issued |= __ceph_caps_dirty(ci); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), h, session, cap); break; case CEPH_CAP_OP_TRUNC: queue_trunc = handle_cap_trunc(inode, h, session, &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); break; default: spin_unlock(&ci->i_ceph_lock); pr_err_client(cl, "unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } done: mutex_unlock(&session->s_mutex); done_unlocked: iput(inode); out: ceph_dec_mds_stopping_blocker(mdsc); ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); kfree(extra_info.fscrypt_auth); return; flush_cap_releases: /* * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ if (do_cap_release) { cap = ceph_get_cap(mdsc, NULL); cap->cap_ino = vino.ino; cap->queue_release = 1; cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; cap->issue_seq = seq; spin_lock(&session->s_cap_lock); __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } ceph_flush_session_cap_releases(mdsc, session); goto done; bad: pr_err_client(cl, "corrupt message\n"); ceph_msg_dump(msg); goto out; } /* * Delayed work handler to process end of delayed cap release LRU list. * * If new caps are added to the list while processing it, these won't get * processed in this run. In this case, the ci->i_hold_caps_max will be * returned so that the work can be scheduled accordingly. */ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_mount_options *opt = mdsc->fsc->mount_options; unsigned long delay_max = opt->caps_wanted_delay_max * HZ; unsigned long loop_start = jiffies; unsigned long delay = 0; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { doutc(cl, "caps added recently. Exiting loop"); delay = ci->i_hold_caps_max; break; } if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && time_before(jiffies, ci->i_hold_caps_max)) break; list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } /* * Make sure too many dirty caps or general * slowness doesn't block mdsc delayed work, * preventing send_renew_caps() from running. */ if (time_after_eq(jiffies, loop_start + 5 * HZ)) break; } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); return delay; } /* * Flush all dirty caps to the mds */ static void flush_dirty_session_caps(struct ceph_mds_session *s) { struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_dirty_lock); while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); inode = &ci->netfs.inode; ihold(inode); doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); doutc(cl, "done\n"); } void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); } /* * Flush all cap releases to the mds */ static void flush_cap_releases(struct ceph_mds_session *s) { struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); spin_lock(&s->s_cap_lock); if (s->s_num_cap_releases) ceph_flush_session_cap_releases(mdsc, s); spin_unlock(&s->s_cap_lock); doutc(cl, "done\n"); } void ceph_flush_cap_releases(struct ceph_mds_client *mdsc) { ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true); } void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode) { unsigned long now = jiffies; if (fmode & CEPH_FILE_MODE_RD) ci->i_last_rd = now; if (fmode & CEPH_FILE_MODE_WR) ci->i_last_wr = now; /* queue periodic check */ if (fmode && __ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list)) __cap_delay_requeue(mdsc, ci); } void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool already_opened = false; int i; if (count == 1) atomic64_inc(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { /* * If any of the mode ref is larger than 0, * that means it has been already opened by * others. Just skip checking the PIN ref. */ if (i && ci->i_nr_by_mode[i]) already_opened = true; if (bits & (1 << i)) ci->i_nr_by_mode[i] += count; } if (!already_opened) percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * Drop open file reference. If we were the last open file, * we may need to release capabilities to the MDS (or schedule * their delayed release). */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; if (count == 1) atomic64_dec(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { BUG_ON(ci->i_nr_by_mode[i] < count); ci->i_nr_by_mode[i] -= count; } /* * If any of the mode ref is not 0 after * decreased, that means it is still opened * by others. Just skip checking the PIN ref. */ if (i && ci->i_nr_by_mode[i]) is_closed = false; } if (is_closed) percpu_counter_dec(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * For a soon-to-be unlinked file, drop the LINK caps. If it * looks like the link count will hit 0, drop any other caps (other * than PIN) we don't specifically want (due to the file still being * open). */ int ceph_drop_caps_for_unlink(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; spin_lock(&ci->i_ceph_lock); if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_unlink_delay_list); spin_unlock(&mdsc->cap_delay_lock); /* * Fire the work immediately, because the MDS maybe * waiting for caps release. */ ceph_queue_cap_unlink_work(mdsc); } } spin_unlock(&ci->i_ceph_lock); return drop; } /* * Helpers for embedding cap and dentry lease releases into mds * requests. * * @force is used by dentry_release (below) to force inclusion of a * record for the directory inode, even when there aren't any caps to * drop. */ int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; int used, dirty; int ret = 0; spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n", inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), ceph_cap_string(unless)); /* only drop unused, clean caps */ drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { unless &= cap->issued; if (unless) { if (unless & CEPH_CAP_AUTH_EXCL) drop &= ~CEPH_CAP_AUTH_SHARED; if (unless & CEPH_CAP_LINK_EXCL) drop &= ~CEPH_CAP_LINK_SHARED; if (unless & CEPH_CAP_XATTR_EXCL) drop &= ~CEPH_CAP_XATTR_SHARED; if (unless & CEPH_CAP_FILE_EXCL) drop &= ~CEPH_CAP_FILE_SHARED; } if (force || (cap->issued & drop)) { if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); doutc(cl, "%p %llx.%llx cap %p %s -> %s, " "wanted %s -> %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(cap->issued & ~drop), ceph_cap_string(cap->mds_wanted), ceph_cap_string(wanted)); cap->issued &= ~drop; cap->implemented &= ~drop; cap->mds_wanted = wanted; if (cap == ci->i_auth_cap && !(wanted & CEPH_CAP_ANY_FILE_WR)) ci->i_requested_max_size = 0; } else { doutc(cl, "%p %llx.%llx cap %p %s (force)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } rel->ino = cpu_to_le64(ceph_ino(inode)); rel->cap_id = cpu_to_le64(cap->cap_id); rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq); rel->mseq = cpu_to_le32(cap->mseq); rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; *p += sizeof(*rel); ret = 1; } else { doutc(cl, "%p %llx.%llx cap %p %s (noop)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } } spin_unlock(&ci->i_ceph_lock); return ret; } /** * ceph_encode_dentry_release - encode a dentry release into an outgoing request * @p: outgoing request buffer * @dentry: dentry to release * @dir: dir to release it from * @mds: mds that we're speaking to * @drop: caps being dropped * @unless: unless we have these caps * * Encode a dentry release into an outgoing request buffer. Returns 1 if the * thing was released, or a negative error code otherwise. */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; /* This shouldn't happen */ BUG_ON(!dir); /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. */ spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { int len = dentry->d_name.len; doutc(cl, "%p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); memcpy(*p, dentry->d_name.name, len); spin_unlock(&dentry->d_lock); if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { len = ceph_encode_encrypted_dname(dir, *p, len); if (len < 0) return len; } rel->dname_len = cpu_to_le32(len); *p += len; } else { spin_unlock(&dentry->d_lock); } return ret; } static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; int capsnap_release = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n", ci, inode, ceph_vinop(inode)); while (!list_empty(&ci->i_cap_snaps)) { capsnap = list_first_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); __ceph_remove_capsnap(inode, capsnap, NULL, NULL); ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); capsnap_release++; } wake_up_all(&ci->i_cap_wq); wake_up_all(&mdsc->cap_flushing_wq); return capsnap_release; } int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) { struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client *cl = fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); bool is_auth; bool dirty_dropped = false; int iputs = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n", cap, ci, inode, ceph_vinop(inode)); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); if (is_auth) { struct ceph_cap_flush *cf; if (ceph_inode_is_shutdown(inode)) { if (inode->i_data.nrpages > 0) *invalidate = true; if (ci->i_wrbuffer_ref > 0) mapping_set_error(&inode->i_data, -EIO); } spin_lock(&mdsc->cap_dirty_lock); /* trash all of the cap flushes for this inode */ while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); list_del_init(&cf->g_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited_client(cl, " dropping dirty %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_vinop(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited_client(cl, " dropping dirty+flushing %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_vinop(inode)); ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; pr_warn_ratelimited_client(cl, " dropping file locks for %p %llx.%llx\n", inode, ceph_vinop(inode)); } if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { cf = ci->i_prealloc_cap_flush; ci->i_prealloc_cap_flush = NULL; if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (!list_empty(&ci->i_cap_snaps)) iputs = remove_capsnaps(mdsc, inode); } if (dirty_dropped) ++iputs; return iputs; } |
| 72 107 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_SYNPROXY_H #define _NF_CONNTRACK_SYNPROXY_H #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netns/generic.h> struct nf_conn_synproxy { u32 isn; u32 its; u32 tsoff; }; static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY); #else return NULL; #endif } static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC); #else return NULL; #endif } static inline bool nf_ct_add_synproxy(struct nf_conn *ct, const struct nf_conn *tmpl) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) if (tmpl && nfct_synproxy(tmpl)) { if (!nfct_seqadj_ext_add(ct)) return false; if (!nfct_synproxy_ext_add(ct)) return false; } #endif return true; } #endif /* _NF_CONNTRACK_SYNPROXY_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_CRASH_DUMP_H #define LINUX_CRASH_DUMP_H #include <linux/kexec.h> #include <linux/proc_fs.h> #include <linux/elf.h> #include <linux/pgtable.h> #include <uapi/linux/vmcore.h> /* For IS_ENABLED(CONFIG_CRASH_DUMP) */ #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; extern unsigned long long dm_crypt_keys_addr; #ifdef CONFIG_CRASH_DUMP extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, unsigned long long paddr, unsigned long long size); extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); ssize_t copy_oldmem_page(struct iov_iter *i, unsigned long pfn, size_t csize, unsigned long offset); ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, size_t csize, unsigned long offset); void vmcore_cleanup(void); /* Architecture code defines this if there are other possible ELF * machine types, e.g. on bi-arch capable hardware. */ #ifndef vmcore_elf_check_arch_cross #define vmcore_elf_check_arch_cross(x) 0 #endif /* * Architecture code can redefine this if there are any special checks * needed for 32-bit ELF or 64-bit ELF vmcores. In case of 32-bit * only architecture, vmcore_elf64_check_arch can be set to zero. */ #ifndef vmcore_elf32_check_arch #define vmcore_elf32_check_arch(x) elf_check_arch(x) #endif #ifndef vmcore_elf64_check_arch #define vmcore_elf64_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x)) #endif #ifndef is_kdump_kernel /* * is_kdump_kernel() checks whether this kernel is booting after a panic of * previous kernel or not. This is determined by checking if previous kernel * has passed the elf core header address on command line. * * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will * return true if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic * of previous kernel. */ static inline bool is_kdump_kernel(void) { return elfcorehdr_addr != ELFCORE_ADDR_MAX; } #endif /* is_vmcore_usable() checks if the kernel is booting after a panic and * the vmcore region is usable. * * This makes use of the fact that due to alignment -2ULL is not * a valid pointer, much in the vain of IS_ERR(), except * dealing directly with an unsigned long long rather than a pointer. */ static inline int is_vmcore_usable(void) { return elfcorehdr_addr != ELFCORE_ADDR_ERR && elfcorehdr_addr != ELFCORE_ADDR_MAX ? 1 : 0; } /* vmcore_unusable() marks the vmcore as unusable, * without disturbing the logic of is_kdump_kernel() */ static inline void vmcore_unusable(void) { elfcorehdr_addr = ELFCORE_ADDR_ERR; } /** * struct vmcore_cb - driver callbacks for /proc/vmcore handling * @pfn_is_ram: check whether a PFN really is RAM and should be accessed when * reading the vmcore. Will return "true" if it is RAM or if the * callback cannot tell. If any callback returns "false", it's not * RAM and the page must not be accessed; zeroes should be * indicated in the vmcore instead. For example, a ballooned page * contains no data and reading from such a page will cause high * load in the hypervisor. * @get_device_ram: query RAM ranges that can only be detected by device * drivers, such as the virtio-mem driver, so they can be included in * the crash dump on architectures that allocate the elfcore hdr in the dump * ("2nd") kernel. Indicated RAM ranges may contain holes to reduce the * total number of ranges; such holes can be detected using the pfn_is_ram * callback just like for other RAM. * @next: List head to manage registered callbacks internally; initialized by * register_vmcore_cb(). * * vmcore callbacks allow drivers managing physical memory ranges to * coordinate with vmcore handling code, for example, to prevent accessing * physical memory ranges that should not be accessed when reading the vmcore, * although included in the vmcore header as memory ranges to dump. */ struct vmcore_cb { bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); int (*get_device_ram)(struct vmcore_cb *cb, struct list_head *list); struct list_head next; }; extern void register_vmcore_cb(struct vmcore_cb *cb); extern void unregister_vmcore_cb(struct vmcore_cb *cb); struct vmcore_range { struct list_head list; unsigned long long paddr; unsigned long long size; loff_t offset; }; /* Allocate a vmcore range and add it to the list. */ static inline int vmcore_alloc_add_range(struct list_head *list, unsigned long long paddr, unsigned long long size) { struct vmcore_range *m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) return -ENOMEM; m->paddr = paddr; m->size = size; list_add_tail(&m->list, list); return 0; } /* Free a list of vmcore ranges. */ static inline void vmcore_free_ranges(struct list_head *list) { struct vmcore_range *m, *tmp; list_for_each_entry_safe(m, tmp, list, list) { list_del(&m->list); kfree(m); } } #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ /* Device Dump information to be filled by drivers */ struct vmcoredd_data { char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */ unsigned int size; /* Size of the dump */ /* Driver's registered callback to be invoked to collect dump */ int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf); }; #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP int vmcore_add_device_dump(struct vmcoredd_data *data); #else static inline int vmcore_add_device_dump(struct vmcoredd_data *data) { return -EOPNOTSUPP; } #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ #ifdef CONFIG_PROC_VMCORE ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, u64 *ppos, bool encrypted); #else static inline ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, u64 *ppos, bool encrypted) { return -EOPNOTSUPP; } #endif /* CONFIG_PROC_VMCORE */ #endif /* LINUX_CRASHDUMP_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H #include <linux/const.h> #include <asm/alternative.h> #include <asm/ibt.h> /* * Constructor for a conventional segment GDT (or LDT) entry. * This is a macro so it can be used in initializers. */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) /* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 #define GDT_ENTRY_BOOT_DS 3 #define GDT_ENTRY_BOOT_TSS 4 #define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) #define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) /* * Bottom two bits of selector give the ring * privilege level */ #define SEGMENT_RPL_MASK 0x3 /* * When running on Xen PV, the actual privilege level of the kernel is 1, * not 0. Testing the Requested Privilege Level in a segment selector to * determine whether the context is user mode or kernel mode with * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level * matches the 0x3 mask. * * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV * kernels because privilege level 2 is never used. */ #define USER_SEGMENT_RPL_MASK 0x2 /* User mode is privilege level 3: */ #define USER_RPL 0x3 /* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ #define SEGMENT_TI_MASK 0x4 /* LDT segment has TI set ... */ #define SEGMENT_LDT 0x4 /* ... GDT has it cleared */ #define SEGMENT_GDT 0x0 #define GDT_ENTRY_INVALID_SEG 0 #if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64) /* * The layout of the per-CPU GDT under Linux: * * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] * 28 - VDSO getcpu * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ #define GDT_ENTRY_TLS_MIN 6 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) #define GDT_ENTRY_KERNEL_CS 12 #define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 #define GDT_ENTRY_DEFAULT_USER_DS 15 #define GDT_ENTRY_TSS 16 #define GDT_ENTRY_LDT 17 #define GDT_ENTRY_PNPBIOS_CS32 18 #define GDT_ENTRY_PNPBIOS_CS16 19 #define GDT_ENTRY_PNPBIOS_DS 20 #define GDT_ENTRY_PNPBIOS_TS1 21 #define GDT_ENTRY_PNPBIOS_TS2 22 #define GDT_ENTRY_APMBIOS_BASE 23 #define GDT_ENTRY_ESPFIX_SS 26 #define GDT_ENTRY_PERCPU 27 #define GDT_ENTRY_CPUNODE 28 #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 32 /* * Segment selector values corresponding to the above entries: */ #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __USER32_CS __USER_CS #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) /* segment for calling fn: */ #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) /* code segment for BIOS: */ #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) /* data segment for BIOS: */ #define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) /* transfer data segment: */ #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) /* another data segment: */ #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) #ifdef CONFIG_SMP # define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else # define __KERNEL_PERCPU 0 #endif #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #else /* 64-bit: */ #include <asm/cache.h> #define GDT_ENTRY_KERNEL32_CS 1 #define GDT_ENTRY_KERNEL_CS 2 #define GDT_ENTRY_KERNEL_DS 3 /* * We cannot use the same code segment descriptor for user and kernel mode, * not even in long flat mode, because of different DPL. * * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes * selectors: * * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, * * ss = STAR.SYSRET_CS+8 (in either case) * * thus USER_DS should be between 32-bit and 64-bit code selectors: */ #define GDT_ENTRY_DEFAULT_USER32_CS 4 #define GDT_ENTRY_DEFAULT_USER_DS 5 #define GDT_ENTRY_DEFAULT_USER_CS 6 /* Needs two entries */ #define GDT_ENTRY_TSS 8 /* Needs two entries */ #define GDT_ENTRY_LDT 10 #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 #define GDT_ENTRY_CPUNODE 15 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 16 /* * Segment selector values corresponding to the above entries: * * Note, selectors also need to have a correct RPL, * expressed with the +3 value for user-space selectors: */ #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #endif #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack: */ #define EXCEPTION_ERRCODE_MASK 0x20027d00 #define GDT_SIZE (GDT_ENTRIES*8) #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) /* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ #define VDSO_CPUNODE_BITS 12 #define VDSO_CPUNODE_MASK 0xfff #ifndef __ASSEMBLER__ /* Helper functions to store/load CPU and node numbers */ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) { return (node << VDSO_CPUNODE_BITS) | cpu; } static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { unsigned int p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP * and works on all CPUs. This is volatile so that it orders * correctly with respect to barrier() and to keep GCC from cleverly * hoisting it out of the calling function. * * If RDPID is available, use it. */ alternative_io ("lsl %[seg],%[p]", ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ X86_FEATURE_RDPID, [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); if (node) *node = (p >> VDSO_CPUNODE_BITS); } #endif /* !__ASSEMBLER__ */ #ifdef __KERNEL__ /* * early_idt_handler_array is an array of entry points referenced in the * early IDT. For simplicity, it's a real array with one entry point * every nine bytes. That leaves room for an optional 'push $0' if the * vector has no error code (two bytes), a 'push $vector_number' (two * bytes), and a jump to the common entry code (up to five bytes). */ #define EARLY_IDT_HANDLER_SIZE (9 + ENDBR_INSN_SIZE) /* * xen_early_idt_handler_array is for Xen pv guests: for each entry in * early_idt_handler_array it contains a prequel in the form of * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to * max 8 bytes. */ #define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE) #ifndef __ASSEMBLER__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); #ifdef CONFIG_XEN_PV extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; #endif /* * Load a segment. Fall back on loading the zero segment if something goes * wrong. This variant assumes that loading zero fully clears the segment. * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any * failure to fully clear the cached descriptor is only observable for * FS and GS. */ #define __loadsegment_simple(seg, value) \ do { \ unsigned short __val = (value); \ \ asm volatile(" \n" \ "1: movl %k0,%%" #seg " \n" \ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\ : "+r" (__val) : : "memory"); \ } while (0) #define __loadsegment_ss(value) __loadsegment_simple(ss, (value)) #define __loadsegment_ds(value) __loadsegment_simple(ds, (value)) #define __loadsegment_es(value) __loadsegment_simple(es, (value)) #ifdef CONFIG_X86_32 /* * On 32-bit systems, the hidden parts of FS and GS are unobservable if * the selector is NULL, so there's no funny business here. */ #define __loadsegment_fs(value) __loadsegment_simple(fs, (value)) #define __loadsegment_gs(value) __loadsegment_simple(gs, (value)) #else static inline void __loadsegment_fs(unsigned short value) { asm volatile(" \n" "1: movw %0, %%fs \n" "2: \n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } /* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. */ #endif #define loadsegment(seg, value) __loadsegment_ ## seg (value) /* * Save a segment register away: */ #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") #endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ |
| 35 34 12 11 39 10 7 41 36 12 12 36 35 36 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * RNG operations. * * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/rng.h> #include <linux/atomic.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" static DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static int crypto_default_rng_refcnt; int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { u8 *buf = NULL; int err; if (!seed && slen) { buf = kmalloc(slen, GFP_KERNEL); if (!buf) return -ENOMEM; err = get_random_bytes_wait(buf, slen); if (err) goto out; seed = buf; } err = crypto_rng_alg(tfm)->seed(tfm, seed, slen); out: kfree_sensitive(buf); return err; } EXPORT_SYMBOL_GPL(crypto_rng_reset); static int crypto_rng_init_tfm(struct crypto_tfm *tfm) { return 0; } static unsigned int seedsize(struct crypto_alg *alg) { struct rng_alg *ralg = container_of(alg, struct rng_alg, base); return ralg->seedsize; } static int __maybe_unused crypto_rng_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_rng rrng; memset(&rrng, 0, sizeof(rrng)); strscpy(rrng.type, "rng", sizeof(rrng.type)); rrng.seedsize = seedsize(alg); return nla_put(skb, CRYPTOCFGA_REPORT_RNG, sizeof(rrng), &rrng); } static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) { seq_printf(m, "type : rng\n"); seq_printf(m, "seedsize : %u\n", seedsize(alg)); } static const struct crypto_type crypto_rng_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_rng_init_tfm, #ifdef CONFIG_PROC_FS .show = crypto_rng_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_rng_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_RNG, .tfmsize = offsetof(struct crypto_rng, base), .algsize = offsetof(struct rng_alg, base), }; struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_rng_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_rng); int crypto_get_default_rng(void) { struct crypto_rng *rng; int err; mutex_lock(&crypto_default_rng_lock); if (!crypto_default_rng) { rng = crypto_alloc_rng("stdrng", 0, 0); err = PTR_ERR(rng); if (IS_ERR(rng)) goto unlock; err = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); if (err) { crypto_free_rng(rng); goto unlock; } crypto_default_rng = rng; } crypto_default_rng_refcnt++; err = 0; unlock: mutex_unlock(&crypto_default_rng_lock); return err; } EXPORT_SYMBOL_GPL(crypto_get_default_rng); void crypto_put_default_rng(void) { mutex_lock(&crypto_default_rng_lock); crypto_default_rng_refcnt--; mutex_unlock(&crypto_default_rng_lock); } EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) int crypto_del_default_rng(void) { int err = -EBUSY; mutex_lock(&crypto_default_rng_lock); if (crypto_default_rng_refcnt) goto out; crypto_free_rng(crypto_default_rng); crypto_default_rng = NULL; err = 0; out: mutex_unlock(&crypto_default_rng_lock); return err; } EXPORT_SYMBOL_GPL(crypto_del_default_rng); #endif int crypto_register_rng(struct rng_alg *alg) { struct crypto_alg *base = &alg->base; if (alg->seedsize > PAGE_SIZE / 8) return -EINVAL; base->cra_type = &crypto_rng_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_RNG; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_rng); void crypto_unregister_rng(struct rng_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_rng); int crypto_register_rngs(struct rng_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_rng(algs + i); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_rng(algs + i); return ret; } EXPORT_SYMBOL_GPL(crypto_register_rngs); void crypto_unregister_rngs(struct rng_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_rng(algs + i); } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Random Number Generator"); |
| 1201 112 397 1407 9348 10681 2494 10456 238 236 690 1186 6186 837 182 5968 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H #include <linux/capability.h> #include <linux/init.h> #include <linux/key.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/uidgid.h> #include <linux/sched.h> #include <linux/sched/user.h> struct cred; struct inode; /* * COW Supplementary groups list */ struct group_info { refcount_t usage; int ngroups; kgid_t gid[]; } __randomize_layout; /** * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. */ static inline struct group_info *get_group_info(struct group_info *gi) { refcount_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (refcount_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_long_t usage; kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct ucounts *ucounts; struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, cap_intersect(cred->cap_permitted, cred->cap_inheritable)); } static inline const struct cred *override_creds(const struct cred *override_cred) { return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { return rcu_replace_pointer(current->cred, revert_cred, 1); } /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference * @nr: Number of references to acquire * * Get references on the specified set of credentials. The caller must release * all acquired reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. Although the * pointer is const, this will temporarily discard the const and increment the * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. */ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; nonconst_cred->non_rcu = 0; atomic_long_add(nr, &nonconst_cred->usage); return cred; } /* * get_cred - Get a reference on a set of credentials * @cred: The credentials to reference * * Get a reference on the specified set of credentials. The caller must * release the reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. */ static inline const struct cred *get_cred(const struct cred *cred) { return get_cred_many(cred, 1); } static inline const struct cred *get_cred_rcu(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; nonconst_cred->non_rcu = 0; return cred; } /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * @nr: Number of references to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. * * This takes a const pointer to a set of credentials because the credentials * on task_struct are attached by const pointers to prevent accidental * alteration of otherwise immutable credential sets. */ static inline void put_cred_many(const struct cred *_cred, int nr) { struct cred *cred = (struct cred *) _cred; if (cred) { if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } } /* * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. */ static inline void put_cred(const struct cred *cred) { put_cred_many(cred, 1); } DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) /** * current_cred - Access the current task's subjective credentials * * Access the subjective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_cred() \ rcu_dereference_protected(current->cred, 1) /** * current_real_cred - Access the current task's objective credentials * * Access the objective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_real_cred() \ rcu_dereference_protected(current->real_cred, 1) /** * __task_cred - Access a task's objective credentials * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ #define __task_cred(task) \ rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that * they can't go away. Accessing the current task's credentials directly is * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) /** * get_current_user - Get the current task's user_struct * * Get the user record of the current task, pinning it so that it can't go * away. */ #define get_current_user() \ ({ \ struct user_struct *__u; \ const struct cred *__cred; \ __cred = current_cred(); \ __u = get_uid(__cred->user); \ __u; \ }) /** * get_current_groups - Get the current task's supplementary group list * * Get the supplementary group list of the current task, pinning it so that it * can't go away. */ #define get_current_groups() \ ({ \ struct group_info *__groups; \ const struct cred *__cred; \ __cred = current_cred(); \ __groups = get_group_info(__cred->group_info); \ __groups; \ }) #define task_cred_xxx(task, xxx) \ ({ \ __typeof__(((struct cred *)NULL)->xxx) ___val; \ rcu_read_lock(); \ ___val = __task_cred((task))->xxx; \ rcu_read_unlock(); \ ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) #define task_ucounts(task) (task_cred_xxx((task), ucounts)) #define current_cred_xxx(xxx) \ ({ \ current_cred()->xxx; \ }) #define current_uid() (current_cred_xxx(uid)) #define current_gid() (current_cred_xxx(gid)) #define current_euid() (current_cred_xxx(euid)) #define current_egid() (current_cred_xxx(egid)) #define current_suid() (current_cred_xxx(suid)) #define current_sgid() (current_cred_xxx(sgid)) #define current_fsuid() (current_cred_xxx(fsuid)) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) #define current_ucounts() (current_cred_xxx(ucounts)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else static inline struct user_namespace *current_user_ns(void) { return &init_user_ns; } #endif #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_uid) = __cred->uid; \ *(_gid) = __cred->gid; \ } while(0) #define current_euid_egid(_euid, _egid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_euid) = __cred->euid; \ *(_egid) = __cred->egid; \ } while(0) #define current_fsuid_fsgid(_fsuid, _fsgid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_fsuid) = __cred->fsuid; \ *(_fsgid) = __cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */ |
| 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/drivers/cpufreq/cpufreq.c * * Copyright (C) 2001 Russell King * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> * (C) 2013 Viresh Kumar <viresh.kumar@linaro.org> * * Oct 2005 - Ashok Raj <ashok.raj@intel.com> * Added handling for CPU hotplug * Feb 2006 - Jacob Shin <jacob.shin@amd.com> * Fix handling for CPU hotplug -- affected CPUs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpu.h> #include <linux/cpufreq.h> #include <linux/cpu_cooling.h> #include <linux/delay.h> #include <linux/device.h> #include <linux/init.h> #include <linux/kernel_stat.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/pm_qos.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include <linux/tick.h> #include <linux/units.h> #include <trace/events/power.h> static LIST_HEAD(cpufreq_policy_list); /* Macros to iterate over CPU policies */ #define for_each_suitable_policy(__policy, __active) \ list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \ if ((__active) == !policy_is_inactive(__policy)) #define for_each_active_policy(__policy) \ for_each_suitable_policy(__policy, true) #define for_each_inactive_policy(__policy) \ for_each_suitable_policy(__policy, false) /* Iterate over governors */ static LIST_HEAD(cpufreq_governor_list); #define for_each_governor(__governor) \ list_for_each_entry(__governor, &cpufreq_governor_list, governor_list) static char default_governor[CPUFREQ_NAME_LEN]; /* * The "cpufreq driver" - the arch- or hardware-dependent low * level driver of CPUFreq support, and its spinlock. This lock * also protects the cpufreq_cpu_data array. */ static struct cpufreq_driver *cpufreq_driver; static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); static DEFINE_RWLOCK(cpufreq_driver_lock); static DEFINE_STATIC_KEY_FALSE(cpufreq_freq_invariance); bool cpufreq_supports_freq_invariance(void) { return static_branch_likely(&cpufreq_freq_invariance); } /* Flag to suspend/resume CPUFreq governors */ static bool cpufreq_suspended; static inline bool has_target(void) { return cpufreq_driver->target_index || cpufreq_driver->target; } bool has_target_index(void) { return !!cpufreq_driver->target_index; } /* internal prototypes */ static unsigned int __cpufreq_get(struct cpufreq_policy *policy); static int cpufreq_init_governor(struct cpufreq_policy *policy); static void cpufreq_exit_governor(struct cpufreq_policy *policy); static void cpufreq_governor_limits(struct cpufreq_policy *policy); static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_governor *new_gov, unsigned int new_pol); static bool cpufreq_boost_supported(void); static int cpufreq_boost_trigger_state(int state); /* * Two notifier lists: the "policy" list is involved in the * validation process for a new CPU frequency policy; the * "transition" list for kernel code that needs to handle * changes to devices when the CPU clock speed changes. * The mutex locks both lists. */ static BLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list); SRCU_NOTIFIER_HEAD_STATIC(cpufreq_transition_notifier_list); static int off __read_mostly; static int cpufreq_disabled(void) { return off; } void disable_cpufreq(void) { off = 1; } EXPORT_SYMBOL_GPL(disable_cpufreq); static DEFINE_MUTEX(cpufreq_governor_mutex); bool have_governor_per_policy(void) { return !!(cpufreq_driver->flags & CPUFREQ_HAVE_GOVERNOR_PER_POLICY); } EXPORT_SYMBOL_GPL(have_governor_per_policy); static struct kobject *cpufreq_global_kobject; struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) return &policy->kobj; else return cpufreq_global_kobject; } EXPORT_SYMBOL_GPL(get_governor_parent_kobj); static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) { struct kernel_cpustat kcpustat; u64 cur_wall_time; u64 idle_time; u64 busy_time; cur_wall_time = jiffies64_to_nsecs(get_jiffies_64()); kcpustat_cpu_fetch(&kcpustat, cpu); busy_time = kcpustat.cpustat[CPUTIME_USER]; busy_time += kcpustat.cpustat[CPUTIME_SYSTEM]; busy_time += kcpustat.cpustat[CPUTIME_IRQ]; busy_time += kcpustat.cpustat[CPUTIME_SOFTIRQ]; busy_time += kcpustat.cpustat[CPUTIME_STEAL]; busy_time += kcpustat.cpustat[CPUTIME_NICE]; idle_time = cur_wall_time - busy_time; if (wall) *wall = div_u64(cur_wall_time, NSEC_PER_USEC); return div_u64(idle_time, NSEC_PER_USEC); } u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) { u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL); if (idle_time == -1ULL) return get_cpu_idle_time_jiffy(cpu, wall); else if (!io_busy) idle_time += get_cpu_iowait_time_us(cpu, wall); return idle_time; } EXPORT_SYMBOL_GPL(get_cpu_idle_time); /* * This is a generic cpufreq init() routine which can be used by cpufreq * drivers of SMP systems. It will do following: * - validate & show freq table passed * - set policies transition latency * - policy->cpus with all possible CPUs */ void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency) { policy->freq_table = table; policy->cpuinfo.transition_latency = transition_latency; /* * The driver only supports the SMP configuration where all processors * share the clock and voltage and clock. */ cpumask_setall(policy->cpus); } EXPORT_SYMBOL_GPL(cpufreq_generic_init); struct cpufreq_policy *cpufreq_cpu_get_raw(unsigned int cpu) { struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu); return policy && cpumask_test_cpu(cpu, policy->cpus) ? policy : NULL; } EXPORT_SYMBOL_GPL(cpufreq_cpu_get_raw); unsigned int cpufreq_generic_get(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get_raw(cpu); if (!policy || IS_ERR(policy->clk)) { pr_err("%s: No %s associated to cpu: %d\n", __func__, policy ? "clk" : "policy", cpu); return 0; } return clk_get_rate(policy->clk) / 1000; } EXPORT_SYMBOL_GPL(cpufreq_generic_get); /** * cpufreq_cpu_get - Return policy for a CPU and mark it as busy. * @cpu: CPU to find the policy for. * * Call cpufreq_cpu_get_raw() to obtain a cpufreq policy for @cpu and increment * the kobject reference counter of that policy. Return a valid policy on * success or NULL on failure. * * The policy returned by this function has to be released with the help of * cpufreq_cpu_put() to balance its kobject reference counter properly. */ struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu) { struct cpufreq_policy *policy = NULL; unsigned long flags; if (WARN_ON(cpu >= nr_cpu_ids)) return NULL; /* get the cpufreq driver */ read_lock_irqsave(&cpufreq_driver_lock, flags); if (cpufreq_driver) { /* get the CPU */ policy = cpufreq_cpu_get_raw(cpu); if (policy) kobject_get(&policy->kobj); } read_unlock_irqrestore(&cpufreq_driver_lock, flags); return policy; } EXPORT_SYMBOL_GPL(cpufreq_cpu_get); /** * cpufreq_cpu_put - Decrement kobject usage counter for cpufreq policy. * @policy: cpufreq policy returned by cpufreq_cpu_get(). */ void cpufreq_cpu_put(struct cpufreq_policy *policy) { kobject_put(&policy->kobj); } EXPORT_SYMBOL_GPL(cpufreq_cpu_put); /********************************************************************* * EXTERNALLY AFFECTING FREQUENCY CHANGES * *********************************************************************/ /** * adjust_jiffies - Adjust the system "loops_per_jiffy". * @val: CPUFREQ_PRECHANGE or CPUFREQ_POSTCHANGE. * @ci: Frequency change information. * * This function alters the system "loops_per_jiffy" for the clock * speed change. Note that loops_per_jiffy cannot be updated on SMP * systems as each CPU might be scaled differently. So, use the arch * per-CPU loops_per_jiffy value wherever possible. */ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) { #ifndef CONFIG_SMP static unsigned long l_p_j_ref; static unsigned int l_p_j_ref_freq; if (ci->flags & CPUFREQ_CONST_LOOPS) return; if (!l_p_j_ref_freq) { l_p_j_ref = loops_per_jiffy; l_p_j_ref_freq = ci->old; pr_debug("saving %lu as reference value for loops_per_jiffy; freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq); } if (val == CPUFREQ_POSTCHANGE && ci->old != ci->new) { loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq, ci->new); pr_debug("scaling loops_per_jiffy to %lu for frequency %u kHz\n", loops_per_jiffy, ci->new); } #endif } /** * cpufreq_notify_transition - Notify frequency transition and adjust jiffies. * @policy: cpufreq policy to enable fast frequency switching for. * @freqs: contain details of the frequency update. * @state: set to CPUFREQ_PRECHANGE or CPUFREQ_POSTCHANGE. * * This function calls the transition notifiers and adjust_jiffies(). * * It is called twice on all CPU frequency changes that have external effects. */ static void cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { int cpu; BUG_ON(irqs_disabled()); if (cpufreq_disabled()) return; freqs->policy = policy; freqs->flags = cpufreq_driver->flags; pr_debug("notification %u of frequency transition to %u kHz\n", state, freqs->new); switch (state) { case CPUFREQ_PRECHANGE: /* * Detect if the driver reported a value as "old frequency" * which is not equal to what the cpufreq core thinks is * "old frequency". */ if (policy->cur && policy->cur != freqs->old) { pr_debug("Warning: CPU frequency is %u, cpufreq assumed %u kHz\n", freqs->old, policy->cur); freqs->old = policy->cur; } srcu_notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_PRECHANGE, freqs); adjust_jiffies(CPUFREQ_PRECHANGE, freqs); break; case CPUFREQ_POSTCHANGE: adjust_jiffies(CPUFREQ_POSTCHANGE, freqs); pr_debug("FREQ: %u - CPUs: %*pbl\n", freqs->new, cpumask_pr_args(policy->cpus)); for_each_cpu(cpu, policy->cpus) trace_cpu_frequency(freqs->new, cpu); srcu_notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_POSTCHANGE, freqs); cpufreq_stats_record_transition(policy, freqs->new); policy->cur = freqs->new; } } /* Do post notifications when there are chances that transition has failed */ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int transition_failed) { cpufreq_notify_transition(policy, freqs, CPUFREQ_POSTCHANGE); if (!transition_failed) return; swap(freqs->old, freqs->new); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(policy, freqs, CPUFREQ_POSTCHANGE); } void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { /* * Catch double invocations of _begin() which lead to self-deadlock. * ASYNC_NOTIFICATION drivers are left out because the cpufreq core * doesn't invoke _begin() on their behalf, and hence the chances of * double invocations are very low. Moreover, there are scenarios * where these checks can emit false-positive warnings in these * drivers; so we avoid that by skipping them altogether. */ WARN_ON(!(cpufreq_driver->flags & CPUFREQ_ASYNC_NOTIFICATION) && current == policy->transition_task); wait: wait_event(policy->transition_wait, !policy->transition_ongoing); spin_lock(&policy->transition_lock); if (unlikely(policy->transition_ongoing)) { spin_unlock(&policy->transition_lock); goto wait; } policy->transition_ongoing = true; policy->transition_task = current; spin_unlock(&policy->transition_lock); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); void cpufreq_freq_transition_end(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int transition_failed) { if (WARN_ON(!policy->transition_ongoing)) return; cpufreq_notify_post_transition(policy, freqs, transition_failed); arch_set_freq_scale(policy->related_cpus, policy->cur, arch_scale_freq_ref(policy->cpu)); spin_lock(&policy->transition_lock); policy->transition_ongoing = false; policy->transition_task = NULL; spin_unlock(&policy->transition_lock); wake_up(&policy->transition_wait); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_end); /* * Fast frequency switching status count. Positive means "enabled", negative * means "disabled" and 0 means "not decided yet". */ static int cpufreq_fast_switch_count; static DEFINE_MUTEX(cpufreq_fast_switch_lock); static void cpufreq_list_transition_notifiers(void) { struct notifier_block *nb; pr_info("Registered transition notifiers:\n"); mutex_lock(&cpufreq_transition_notifier_list.mutex); for (nb = cpufreq_transition_notifier_list.head; nb; nb = nb->next) pr_info("%pS\n", nb->notifier_call); mutex_unlock(&cpufreq_transition_notifier_list.mutex); } /** * cpufreq_enable_fast_switch - Enable fast frequency switching for policy. * @policy: cpufreq policy to enable fast frequency switching for. * * Try to enable fast frequency switching for @policy. * * The attempt will fail if there is at least one transition notifier registered * at this point, as fast frequency switching is quite fundamentally at odds * with transition notifiers. Thus if successful, it will make registration of * transition notifiers fail going forward. */ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy) { lockdep_assert_held(&policy->rwsem); if (!policy->fast_switch_possible) return; mutex_lock(&cpufreq_fast_switch_lock); if (cpufreq_fast_switch_count >= 0) { cpufreq_fast_switch_count++; policy->fast_switch_enabled = true; } else { pr_warn("CPU%u: Fast frequency switching not enabled\n", policy->cpu); cpufreq_list_transition_notifiers(); } mutex_unlock(&cpufreq_fast_switch_lock); } EXPORT_SYMBOL_GPL(cpufreq_enable_fast_switch); /** * cpufreq_disable_fast_switch - Disable fast frequency switching for policy. * @policy: cpufreq policy to disable fast frequency switching for. */ void cpufreq_disable_fast_switch(struct cpufreq_policy *policy) { mutex_lock(&cpufreq_fast_switch_lock); if (policy->fast_switch_enabled) { policy->fast_switch_enabled = false; if (!WARN_ON(cpufreq_fast_switch_count <= 0)) cpufreq_fast_switch_count--; } mutex_unlock(&cpufreq_fast_switch_lock); } EXPORT_SYMBOL_GPL(cpufreq_disable_fast_switch); static unsigned int __resolve_freq(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int min, unsigned int max, unsigned int relation) { unsigned int idx; target_freq = clamp_val(target_freq, min, max); if (!policy->freq_table) return target_freq; idx = cpufreq_frequency_table_target(policy, target_freq, min, max, relation); policy->cached_resolved_idx = idx; policy->cached_target_freq = target_freq; return policy->freq_table[idx].frequency; } /** * cpufreq_driver_resolve_freq - Map a target frequency to a driver-supported * one. * @policy: associated policy to interrogate * @target_freq: target frequency to resolve. * * The target to driver frequency mapping is cached in the policy. * * Return: Lowest driver-supported frequency greater than or equal to the * given target_freq, subject to policy (min/max) and driver limitations. */ unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, unsigned int target_freq) { unsigned int min = READ_ONCE(policy->min); unsigned int max = READ_ONCE(policy->max); /* * If this function runs in parallel with cpufreq_set_policy(), it may * read policy->min before the update and policy->max after the update * or the other way around, so there is no ordering guarantee. * * Resolve this by always honoring the max (in case it comes from * thermal throttling or similar). */ if (unlikely(min > max)) min = max; return __resolve_freq(policy, target_freq, min, max, CPUFREQ_RELATION_LE); } EXPORT_SYMBOL_GPL(cpufreq_driver_resolve_freq); unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) { unsigned int latency; if (policy->transition_delay_us) return policy->transition_delay_us; latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; if (latency) /* Give a 50% breathing room between updates */ return latency + (latency >> 1); return USEC_PER_MSEC; } EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); /********************************************************************* * SYSFS INTERFACE * *********************************************************************/ static ssize_t show_boost(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", cpufreq_driver->boost_enabled); } static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { bool enable; if (kstrtobool(buf, &enable)) return -EINVAL; if (cpufreq_boost_trigger_state(enable)) { pr_err("%s: Cannot %s BOOST!\n", __func__, str_enable_disable(enable)); return -EINVAL; } pr_debug("%s: cpufreq BOOST %s\n", __func__, str_enabled_disabled(enable)); return count; } define_one_global_rw(boost); static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf) { return sysfs_emit(buf, "%d\n", policy->boost_enabled); } static int policy_set_boost(struct cpufreq_policy *policy, bool enable) { int ret; if (policy->boost_enabled == enable) return 0; policy->boost_enabled = enable; ret = cpufreq_driver->set_boost(policy, enable); if (ret) policy->boost_enabled = !policy->boost_enabled; return ret; } static ssize_t store_local_boost(struct cpufreq_policy *policy, const char *buf, size_t count) { int ret; bool enable; if (kstrtobool(buf, &enable)) return -EINVAL; if (!cpufreq_driver->boost_enabled) return -EINVAL; if (!policy->boost_supported) return -EINVAL; ret = policy_set_boost(policy, enable); if (!ret) return count; return ret; } static struct freq_attr local_boost = __ATTR(boost, 0644, show_local_boost, store_local_boost); static struct cpufreq_governor *find_governor(const char *str_governor) { struct cpufreq_governor *t; for_each_governor(t) if (!strncasecmp(str_governor, t->name, CPUFREQ_NAME_LEN)) return t; return NULL; } static struct cpufreq_governor *get_governor(const char *str_governor) { struct cpufreq_governor *t; mutex_lock(&cpufreq_governor_mutex); t = find_governor(str_governor); if (!t) goto unlock; if (!try_module_get(t->owner)) t = NULL; unlock: mutex_unlock(&cpufreq_governor_mutex); return t; } static unsigned int cpufreq_parse_policy(char *str_governor) { if (!strncasecmp(str_governor, "performance", CPUFREQ_NAME_LEN)) return CPUFREQ_POLICY_PERFORMANCE; if (!strncasecmp(str_governor, "powersave", CPUFREQ_NAME_LEN)) return CPUFREQ_POLICY_POWERSAVE; return CPUFREQ_POLICY_UNKNOWN; } /** * cpufreq_parse_governor - parse a governor string only for has_target() * @str_governor: Governor name. */ static struct cpufreq_governor *cpufreq_parse_governor(char *str_governor) { struct cpufreq_governor *t; t = get_governor(str_governor); if (t) return t; if (request_module("cpufreq_%s", str_governor)) return NULL; return get_governor(str_governor); } /* * cpufreq_per_cpu_attr_read() / show_##file_name() - * print out cpufreq information * * Write out information from cpufreq_driver->policy[cpu]; object must be * "unsigned int". */ #define show_one(file_name, object) \ static ssize_t show_##file_name \ (struct cpufreq_policy *policy, char *buf) \ { \ return sysfs_emit(buf, "%u\n", policy->object); \ } show_one(cpuinfo_min_freq, cpuinfo.min_freq); show_one(cpuinfo_max_freq, cpuinfo.max_freq); show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); show_one(scaling_min_freq, min); show_one(scaling_max_freq, max); __weak int arch_freq_get_on_cpu(int cpu) { return -EOPNOTSUPP; } static inline bool cpufreq_avg_freq_supported(struct cpufreq_policy *policy) { return arch_freq_get_on_cpu(policy->cpu) != -EOPNOTSUPP; } static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf) { ssize_t ret; int freq; freq = IS_ENABLED(CONFIG_CPUFREQ_ARCH_CUR_FREQ) ? arch_freq_get_on_cpu(policy->cpu) : 0; if (freq > 0) ret = sysfs_emit(buf, "%u\n", freq); else if (cpufreq_driver->setpolicy && cpufreq_driver->get) ret = sysfs_emit(buf, "%u\n", cpufreq_driver->get(policy->cpu)); else ret = sysfs_emit(buf, "%u\n", policy->cur); return ret; } /* * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access */ #define store_one(file_name, object) \ static ssize_t store_##file_name \ (struct cpufreq_policy *policy, const char *buf, size_t count) \ { \ unsigned long val; \ int ret; \ \ ret = kstrtoul(buf, 0, &val); \ if (ret) \ return ret; \ \ ret = freq_qos_update_request(policy->object##_freq_req, val);\ return ret >= 0 ? count : ret; \ } store_one(scaling_min_freq, min); store_one(scaling_max_freq, max); /* * show_cpuinfo_cur_freq - current CPU frequency as detected by hardware */ static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy, char *buf) { unsigned int cur_freq = __cpufreq_get(policy); if (cur_freq) return sysfs_emit(buf, "%u\n", cur_freq); return sysfs_emit(buf, "<unknown>\n"); } /* * show_cpuinfo_avg_freq - average CPU frequency as detected by hardware */ static ssize_t show_cpuinfo_avg_freq(struct cpufreq_policy *policy, char *buf) { int avg_freq = arch_freq_get_on_cpu(policy->cpu); if (avg_freq > 0) return sysfs_emit(buf, "%u\n", avg_freq); return avg_freq != 0 ? avg_freq : -EINVAL; } /* * show_scaling_governor - show the current policy for the specified CPU */ static ssize_t show_scaling_governor(struct cpufreq_policy *policy, char *buf) { if (policy->policy == CPUFREQ_POLICY_POWERSAVE) return sysfs_emit(buf, "powersave\n"); else if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) return sysfs_emit(buf, "performance\n"); else if (policy->governor) return sysfs_emit(buf, "%s\n", policy->governor->name); return -EINVAL; } /* * store_scaling_governor - store policy for the specified CPU */ static ssize_t store_scaling_governor(struct cpufreq_policy *policy, const char *buf, size_t count) { char str_governor[CPUFREQ_NAME_LEN]; int ret; ret = sscanf(buf, "%15s", str_governor); if (ret != 1) return -EINVAL; if (cpufreq_driver->setpolicy) { unsigned int new_pol; new_pol = cpufreq_parse_policy(str_governor); if (!new_pol) return -EINVAL; ret = cpufreq_set_policy(policy, NULL, new_pol); } else { struct cpufreq_governor *new_gov; new_gov = cpufreq_parse_governor(str_governor); if (!new_gov) return -EINVAL; ret = cpufreq_set_policy(policy, new_gov, CPUFREQ_POLICY_UNKNOWN); module_put(new_gov->owner); } return ret ? ret : count; } /* * show_scaling_driver - show the cpufreq driver currently loaded */ static ssize_t show_scaling_driver(struct cpufreq_policy *policy, char *buf) { return scnprintf(buf, CPUFREQ_NAME_PLEN, "%s\n", cpufreq_driver->name); } /* * show_scaling_available_governors - show the available CPUfreq governors */ static ssize_t show_scaling_available_governors(struct cpufreq_policy *policy, char *buf) { ssize_t i = 0; struct cpufreq_governor *t; if (!has_target()) { i += sysfs_emit(buf, "performance powersave"); goto out; } mutex_lock(&cpufreq_governor_mutex); for_each_governor(t) { if (i >= (ssize_t) ((PAGE_SIZE / sizeof(char)) - (CPUFREQ_NAME_LEN + 2))) break; i += sysfs_emit_at(buf, i, "%s ", t->name); } mutex_unlock(&cpufreq_governor_mutex); out: i += sysfs_emit_at(buf, i, "\n"); return i; } ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf) { ssize_t i = 0; unsigned int cpu; for_each_cpu(cpu, mask) { i += sysfs_emit_at(buf, i, "%u ", cpu); if (i >= (PAGE_SIZE - 5)) break; } /* Remove the extra space at the end */ i--; i += sysfs_emit_at(buf, i, "\n"); return i; } EXPORT_SYMBOL_GPL(cpufreq_show_cpus); /* * show_related_cpus - show the CPUs affected by each transition even if * hw coordination is in use */ static ssize_t show_related_cpus(struct cpufreq_policy *policy, char *buf) { return cpufreq_show_cpus(policy->related_cpus, buf); } /* * show_affected_cpus - show the CPUs affected by each transition */ static ssize_t show_affected_cpus(struct cpufreq_policy *policy, char *buf) { return cpufreq_show_cpus(policy->cpus, buf); } static ssize_t store_scaling_setspeed(struct cpufreq_policy *policy, const char *buf, size_t count) { unsigned int freq = 0; unsigned int ret; if (!policy->governor || !policy->governor->store_setspeed) return -EINVAL; ret = kstrtouint(buf, 0, &freq); if (ret) return ret; policy->governor->store_setspeed(policy, freq); return count; } static ssize_t show_scaling_setspeed(struct cpufreq_policy *policy, char *buf) { if (!policy->governor || !policy->governor->show_setspeed) return sysfs_emit(buf, "<unsupported>\n"); return policy->governor->show_setspeed(policy, buf); } /* * show_bios_limit - show the current cpufreq HW/BIOS limitation */ static ssize_t show_bios_limit(struct cpufreq_policy *policy, char *buf) { unsigned int limit; int ret; ret = cpufreq_driver->bios_limit(policy->cpu, &limit); if (!ret) return sysfs_emit(buf, "%u\n", limit); return sysfs_emit(buf, "%u\n", policy->cpuinfo.max_freq); } cpufreq_freq_attr_ro_perm(cpuinfo_cur_freq, 0400); cpufreq_freq_attr_ro(cpuinfo_avg_freq); cpufreq_freq_attr_ro(cpuinfo_min_freq); cpufreq_freq_attr_ro(cpuinfo_max_freq); cpufreq_freq_attr_ro(cpuinfo_transition_latency); cpufreq_freq_attr_ro(scaling_available_governors); cpufreq_freq_attr_ro(scaling_driver); cpufreq_freq_attr_ro(scaling_cur_freq); cpufreq_freq_attr_ro(bios_limit); cpufreq_freq_attr_ro(related_cpus); cpufreq_freq_attr_ro(affected_cpus); cpufreq_freq_attr_rw(scaling_min_freq); cpufreq_freq_attr_rw(scaling_max_freq); cpufreq_freq_attr_rw(scaling_governor); cpufreq_freq_attr_rw(scaling_setspeed); static struct attribute *cpufreq_attrs[] = { &cpuinfo_min_freq.attr, &cpuinfo_max_freq.attr, &cpuinfo_transition_latency.attr, &scaling_cur_freq.attr, &scaling_min_freq.attr, &scaling_max_freq.attr, &affected_cpus.attr, &related_cpus.attr, &scaling_governor.attr, &scaling_driver.attr, &scaling_available_governors.attr, &scaling_setspeed.attr, NULL }; ATTRIBUTE_GROUPS(cpufreq); #define to_policy(k) container_of(k, struct cpufreq_policy, kobj) #define to_attr(a) container_of(a, struct freq_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); if (!fattr->show) return -EIO; guard(cpufreq_policy_read)(policy); if (likely(!policy_is_inactive(policy))) return fattr->show(policy, buf); return -EBUSY; } static ssize_t store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); if (!fattr->store) return -EIO; guard(cpufreq_policy_write)(policy); if (likely(!policy_is_inactive(policy))) return fattr->store(policy, buf, count); return -EBUSY; } static void cpufreq_sysfs_release(struct kobject *kobj) { struct cpufreq_policy *policy = to_policy(kobj); pr_debug("last reference is dropped\n"); complete(&policy->kobj_unregister); } static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; static const struct kobj_type ktype_cpufreq = { .sysfs_ops = &sysfs_ops, .default_groups = cpufreq_groups, .release = cpufreq_sysfs_release, }; static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu, struct device *dev) { if (unlikely(!dev)) return; if (cpumask_test_and_set_cpu(cpu, policy->real_cpus)) return; dev_dbg(dev, "%s: Adding symlink\n", __func__); if (sysfs_create_link(&dev->kobj, &policy->kobj, "cpufreq")) dev_err(dev, "cpufreq symlink creation failed\n"); } static void remove_cpu_dev_symlink(struct cpufreq_policy *policy, int cpu, struct device *dev) { dev_dbg(dev, "%s: Removing symlink\n", __func__); sysfs_remove_link(&dev->kobj, "cpufreq"); cpumask_clear_cpu(cpu, policy->real_cpus); } static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) { struct freq_attr **drv_attr; int ret = 0; /* Attributes that need freq_table */ if (policy->freq_table) { ret = sysfs_create_file(&policy->kobj, &cpufreq_freq_attr_scaling_available_freqs.attr); if (ret) return ret; if (cpufreq_boost_supported()) { ret = sysfs_create_file(&policy->kobj, &cpufreq_freq_attr_scaling_boost_freqs.attr); if (ret) return ret; } } /* set up files for this cpu device */ drv_attr = cpufreq_driver->attr; while (drv_attr && *drv_attr) { ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); if (ret) return ret; drv_attr++; } if (cpufreq_driver->get) { ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr); if (ret) return ret; } if (cpufreq_avg_freq_supported(policy)) { ret = sysfs_create_file(&policy->kobj, &cpuinfo_avg_freq.attr); if (ret) return ret; } if (cpufreq_driver->bios_limit) { ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); if (ret) return ret; } if (cpufreq_boost_supported()) { ret = sysfs_create_file(&policy->kobj, &local_boost.attr); if (ret) return ret; } return 0; } static int cpufreq_init_policy(struct cpufreq_policy *policy) { struct cpufreq_governor *gov = NULL; unsigned int pol = CPUFREQ_POLICY_UNKNOWN; int ret; if (has_target()) { /* Update policy governor to the one used before hotplug. */ gov = get_governor(policy->last_governor); if (gov) { pr_debug("Restoring governor %s for cpu %d\n", gov->name, policy->cpu); } else { gov = get_governor(default_governor); } if (!gov) { gov = cpufreq_default_governor(); __module_get(gov->owner); } } else { /* Use the default policy if there is no last_policy. */ if (policy->last_policy) { pol = policy->last_policy; } else { pol = cpufreq_parse_policy(default_governor); /* * In case the default governor is neither "performance" * nor "powersave", fall back to the initial policy * value set by the driver. */ if (pol == CPUFREQ_POLICY_UNKNOWN) pol = policy->policy; } if (pol != CPUFREQ_POLICY_PERFORMANCE && pol != CPUFREQ_POLICY_POWERSAVE) return -ENODATA; } ret = cpufreq_set_policy(policy, gov, pol); if (gov) module_put(gov->owner); return ret; } static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu) { int ret = 0; /* Has this CPU been taken care of already? */ if (cpumask_test_cpu(cpu, policy->cpus)) return 0; guard(cpufreq_policy_write)(policy); if (has_target()) cpufreq_stop_governor(policy); cpumask_set_cpu(cpu, policy->cpus); if (has_target()) { ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } return ret; } void refresh_frequency_limits(struct cpufreq_policy *policy) { if (!policy_is_inactive(policy)) { pr_debug("updating policy for CPU %u\n", policy->cpu); cpufreq_set_policy(policy, policy->governor, policy->policy); } } EXPORT_SYMBOL(refresh_frequency_limits); static void handle_update(struct work_struct *work) { struct cpufreq_policy *policy = container_of(work, struct cpufreq_policy, update); pr_debug("handle_update for cpu %u called\n", policy->cpu); guard(cpufreq_policy_write)(policy); refresh_frequency_limits(policy); } static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq, void *data) { struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_min); schedule_work(&policy->update); return 0; } static int cpufreq_notifier_max(struct notifier_block *nb, unsigned long freq, void *data) { struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_max); schedule_work(&policy->update); return 0; } static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy) { struct kobject *kobj; struct completion *cmp; scoped_guard(cpufreq_policy_write, policy) { cpufreq_stats_free_table(policy); kobj = &policy->kobj; cmp = &policy->kobj_unregister; } kobject_put(kobj); /* * We need to make sure that the underlying kobj is * actually not referenced anymore by anybody before we * proceed with unloading. */ pr_debug("waiting for dropping of refcount\n"); wait_for_completion(cmp); pr_debug("wait complete\n"); } static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) { struct cpufreq_policy *policy; struct device *dev = get_cpu_device(cpu); int ret; if (!dev) return NULL; policy = kzalloc(sizeof(*policy), GFP_KERNEL); if (!policy) return NULL; if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL)) goto err_free_policy; if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) goto err_free_cpumask; if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL)) goto err_free_rcpumask; init_completion(&policy->kobj_unregister); ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, cpufreq_global_kobject, "policy%u", cpu); if (ret) { dev_err(dev, "%s: failed to init policy->kobj: %d\n", __func__, ret); /* * The entire policy object will be freed below, but the extra * memory allocated for the kobject name needs to be freed by * releasing the kobject. */ kobject_put(&policy->kobj); goto err_free_real_cpus; } init_rwsem(&policy->rwsem); freq_constraints_init(&policy->constraints); policy->nb_min.notifier_call = cpufreq_notifier_min; policy->nb_max.notifier_call = cpufreq_notifier_max; ret = freq_qos_add_notifier(&policy->constraints, FREQ_QOS_MIN, &policy->nb_min); if (ret) { dev_err(dev, "Failed to register MIN QoS notifier: %d (CPU%u)\n", ret, cpu); goto err_kobj_remove; } ret = freq_qos_add_notifier(&policy->constraints, FREQ_QOS_MAX, &policy->nb_max); if (ret) { dev_err(dev, "Failed to register MAX QoS notifier: %d (CPU%u)\n", ret, cpu); goto err_min_qos_notifier; } INIT_LIST_HEAD(&policy->policy_list); spin_lock_init(&policy->transition_lock); init_waitqueue_head(&policy->transition_wait); INIT_WORK(&policy->update, handle_update); return policy; err_min_qos_notifier: freq_qos_remove_notifier(&policy->constraints, FREQ_QOS_MIN, &policy->nb_min); err_kobj_remove: cpufreq_policy_put_kobj(policy); err_free_real_cpus: free_cpumask_var(policy->real_cpus); err_free_rcpumask: free_cpumask_var(policy->related_cpus); err_free_cpumask: free_cpumask_var(policy->cpus); err_free_policy: kfree(policy); return NULL; } static void cpufreq_policy_free(struct cpufreq_policy *policy) { unsigned long flags; int cpu; /* * The callers must ensure the policy is inactive by now, to avoid any * races with show()/store() callbacks. */ if (unlikely(!policy_is_inactive(policy))) pr_warn("%s: Freeing active policy\n", __func__); /* Remove policy from list */ write_lock_irqsave(&cpufreq_driver_lock, flags); list_del(&policy->policy_list); for_each_cpu(cpu, policy->related_cpus) per_cpu(cpufreq_cpu_data, cpu) = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); freq_qos_remove_notifier(&policy->constraints, FREQ_QOS_MAX, &policy->nb_max); freq_qos_remove_notifier(&policy->constraints, FREQ_QOS_MIN, &policy->nb_min); /* Cancel any pending policy->update work before freeing the policy. */ cancel_work_sync(&policy->update); if (policy->max_freq_req) { /* * Remove max_freq_req after sending CPUFREQ_REMOVE_POLICY * notification, since CPUFREQ_CREATE_POLICY notification was * sent after adding max_freq_req earlier. */ blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_REMOVE_POLICY, policy); freq_qos_remove_request(policy->max_freq_req); } freq_qos_remove_request(policy->min_freq_req); kfree(policy->min_freq_req); cpufreq_policy_put_kobj(policy); free_cpumask_var(policy->real_cpus); free_cpumask_var(policy->related_cpus); free_cpumask_var(policy->cpus); kfree(policy); } static int cpufreq_policy_online(struct cpufreq_policy *policy, unsigned int cpu, bool new_policy) { unsigned long flags; unsigned int j; int ret; guard(cpufreq_policy_write)(policy); policy->cpu = cpu; policy->governor = NULL; if (!new_policy && cpufreq_driver->online) { /* Recover policy->cpus using related_cpus */ cpumask_copy(policy->cpus, policy->related_cpus); ret = cpufreq_driver->online(policy); if (ret) { pr_debug("%s: %d: initialization failed\n", __func__, __LINE__); goto out_exit_policy; } } else { cpumask_copy(policy->cpus, cpumask_of(cpu)); /* * Call driver. From then on the cpufreq must be able * to accept all calls to ->verify and ->setpolicy for this CPU. */ ret = cpufreq_driver->init(policy); if (ret) { pr_debug("%s: %d: initialization failed\n", __func__, __LINE__); goto out_clear_policy; } /* * The initialization has succeeded and the policy is online. * If there is a problem with its frequency table, take it * offline and drop it. */ ret = cpufreq_table_validate_and_sort(policy); if (ret) goto out_offline_policy; /* related_cpus should at least include policy->cpus. */ cpumask_copy(policy->related_cpus, policy->cpus); } /* * affected cpus must always be the one, which are online. We aren't * managing offline cpus here. */ cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); if (new_policy) { for_each_cpu(j, policy->related_cpus) { per_cpu(cpufreq_cpu_data, j) = policy; add_cpu_dev_symlink(policy, j, get_cpu_device(j)); } policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req), GFP_KERNEL); if (!policy->min_freq_req) { ret = -ENOMEM; goto out_destroy_policy; } ret = freq_qos_add_request(&policy->constraints, policy->min_freq_req, FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); if (ret < 0) { /* * So we don't call freq_qos_remove_request() for an * uninitialized request. */ kfree(policy->min_freq_req); policy->min_freq_req = NULL; goto out_destroy_policy; } /* * This must be initialized right here to avoid calling * freq_qos_remove_request() on uninitialized request in case * of errors. */ policy->max_freq_req = policy->min_freq_req + 1; ret = freq_qos_add_request(&policy->constraints, policy->max_freq_req, FREQ_QOS_MAX, FREQ_QOS_MAX_DEFAULT_VALUE); if (ret < 0) { policy->max_freq_req = NULL; goto out_destroy_policy; } blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); } else { ret = freq_qos_update_request(policy->max_freq_req, policy->max); if (ret < 0) goto out_destroy_policy; } if (cpufreq_driver->get && has_target()) { policy->cur = cpufreq_driver->get(policy->cpu); if (!policy->cur) { ret = -EIO; pr_err("%s: ->get() failed\n", __func__); goto out_destroy_policy; } } /* * Sometimes boot loaders set CPU frequency to a value outside of * frequency table present with cpufreq core. In such cases CPU might be * unstable if it has to run on that frequency for long duration of time * and so its better to set it to a frequency which is specified in * freq-table. This also makes cpufreq stats inconsistent as * cpufreq-stats would fail to register because current frequency of CPU * isn't found in freq-table. * * Because we don't want this change to effect boot process badly, we go * for the next freq which is >= policy->cur ('cur' must be set by now, * otherwise we will end up setting freq to lowest of the table as 'cur' * is initialized to zero). * * We are passing target-freq as "policy->cur - 1" otherwise * __cpufreq_driver_target() would simply fail, as policy->cur will be * equal to target-freq. */ if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK) && has_target()) { unsigned int old_freq = policy->cur; /* Are we running at unknown frequency ? */ ret = cpufreq_frequency_table_get_index(policy, old_freq); if (ret == -EINVAL) { ret = __cpufreq_driver_target(policy, old_freq - 1, CPUFREQ_RELATION_L); /* * Reaching here after boot in a few seconds may not * mean that system will remain stable at "unknown" * frequency for longer duration. Hence, a BUG_ON(). */ BUG_ON(ret); pr_info("%s: CPU%d: Running at unlisted initial frequency: %u kHz, changing to: %u kHz\n", __func__, policy->cpu, old_freq, policy->cur); } } if (new_policy) { ret = cpufreq_add_dev_interface(policy); if (ret) goto out_destroy_policy; cpufreq_stats_create_table(policy); write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); write_unlock_irqrestore(&cpufreq_driver_lock, flags); /* * Register with the energy model before * em_rebuild_sched_domains() is called, which will result * in rebuilding of the sched domains, which should only be done * once the energy model is properly initialized for the policy * first. * * Also, this should be called before the policy is registered * with cooling framework. */ if (cpufreq_driver->register_em) cpufreq_driver->register_em(policy); } ret = cpufreq_init_policy(policy); if (ret) { pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n", __func__, cpu, ret); goto out_destroy_policy; } return 0; out_destroy_policy: for_each_cpu(j, policy->real_cpus) remove_cpu_dev_symlink(policy, j, get_cpu_device(j)); out_offline_policy: if (cpufreq_driver->offline) cpufreq_driver->offline(policy); out_exit_policy: if (cpufreq_driver->exit) cpufreq_driver->exit(policy); out_clear_policy: cpumask_clear(policy->cpus); return ret; } static int cpufreq_online(unsigned int cpu) { struct cpufreq_policy *policy; bool new_policy; int ret; pr_debug("%s: bringing CPU%u online\n", __func__, cpu); /* Check if this CPU already has a policy to manage it */ policy = per_cpu(cpufreq_cpu_data, cpu); if (policy) { WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus)); if (!policy_is_inactive(policy)) return cpufreq_add_policy_cpu(policy, cpu); /* This is the only online CPU for the policy. Start over. */ new_policy = false; } else { new_policy = true; policy = cpufreq_policy_alloc(cpu); if (!policy) return -ENOMEM; } ret = cpufreq_policy_online(policy, cpu, new_policy); if (ret) { cpufreq_policy_free(policy); return ret; } kobject_uevent(&policy->kobj, KOBJ_ADD); /* Callback for handling stuff after policy is ready */ if (cpufreq_driver->ready) cpufreq_driver->ready(policy); /* Register cpufreq cooling only for a new policy */ if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); /* * Let the per-policy boost flag mirror the cpufreq_driver boost during * initialization for a new policy. For an existing policy, maintain the * previous boost value unless global boost is disabled. */ if (cpufreq_driver->set_boost && policy->boost_supported && (new_policy || !cpufreq_boost_enabled())) { ret = policy_set_boost(policy, cpufreq_boost_enabled()); if (ret) { /* If the set_boost fails, the online operation is not affected */ pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu, str_enable_disable(cpufreq_boost_enabled())); } } pr_debug("initialization complete\n"); return 0; } /** * cpufreq_add_dev - the cpufreq interface for a CPU device. * @dev: CPU device. * @sif: Subsystem interface structure pointer (not used) */ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) { struct cpufreq_policy *policy; unsigned cpu = dev->id; int ret; dev_dbg(dev, "%s: adding CPU%u\n", __func__, cpu); if (cpu_online(cpu)) { ret = cpufreq_online(cpu); if (ret) return ret; } /* Create sysfs link on CPU registration */ policy = per_cpu(cpufreq_cpu_data, cpu); if (policy) add_cpu_dev_symlink(policy, cpu, dev); return 0; } static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) { int ret; if (has_target()) cpufreq_stop_governor(policy); cpumask_clear_cpu(cpu, policy->cpus); if (!policy_is_inactive(policy)) { /* Nominate a new CPU if necessary. */ if (cpu == policy->cpu) policy->cpu = cpumask_any(policy->cpus); /* Start the governor again for the active policy. */ if (has_target()) { ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } return; } if (has_target()) { strscpy(policy->last_governor, policy->governor->name, CPUFREQ_NAME_LEN); cpufreq_exit_governor(policy); } else { policy->last_policy = policy->policy; } /* * Perform the ->offline() during light-weight tear-down, as * that allows fast recovery when the CPU comes back. */ if (cpufreq_driver->offline) { cpufreq_driver->offline(policy); return; } if (cpufreq_driver->exit) cpufreq_driver->exit(policy); policy->freq_table = NULL; } static int cpufreq_offline(unsigned int cpu) { struct cpufreq_policy *policy; pr_debug("%s: unregistering CPU %u\n", __func__, cpu); policy = cpufreq_cpu_get_raw(cpu); if (!policy) { pr_debug("%s: No cpu_data found\n", __func__); return 0; } guard(cpufreq_policy_write)(policy); __cpufreq_offline(cpu, policy); return 0; } /* * cpufreq_remove_dev - remove a CPU device * * Removes the cpufreq interface for a CPU device. */ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) { unsigned int cpu = dev->id; struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu); if (!policy) return; scoped_guard(cpufreq_policy_write, policy) { if (cpu_online(cpu)) __cpufreq_offline(cpu, policy); remove_cpu_dev_symlink(policy, cpu, dev); if (!cpumask_empty(policy->real_cpus)) return; /* * Unregister cpufreq cooling once all the CPUs of the policy * are removed. */ if (cpufreq_thermal_control_enabled(cpufreq_driver)) { cpufreq_cooling_unregister(policy->cdev); policy->cdev = NULL; } /* We did light-weight exit earlier, do full tear down now */ if (cpufreq_driver->offline && cpufreq_driver->exit) cpufreq_driver->exit(policy); } cpufreq_policy_free(policy); } /** * cpufreq_out_of_sync - Fix up actual and saved CPU frequency difference. * @policy: Policy managing CPUs. * @new_freq: New CPU frequency. * * Adjust to the current frequency first and clean up later by either calling * cpufreq_update_policy(), or scheduling handle_update(). */ static void cpufreq_out_of_sync(struct cpufreq_policy *policy, unsigned int new_freq) { struct cpufreq_freqs freqs; pr_debug("Warning: CPU frequency out of sync: cpufreq and timing core thinks of %u, is %u kHz\n", policy->cur, new_freq); freqs.old = policy->cur; freqs.new = new_freq; cpufreq_freq_transition_begin(policy, &freqs); cpufreq_freq_transition_end(policy, &freqs, 0); } static unsigned int cpufreq_verify_current_freq(struct cpufreq_policy *policy, bool update) { unsigned int new_freq; if (!cpufreq_driver->get) return 0; new_freq = cpufreq_driver->get(policy->cpu); if (!new_freq) return 0; /* * If fast frequency switching is used with the given policy, the check * against policy->cur is pointless, so skip it in that case. */ if (policy->fast_switch_enabled || !has_target()) return new_freq; if (policy->cur != new_freq) { /* * For some platforms, the frequency returned by hardware may be * slightly different from what is provided in the frequency * table, for example hardware may return 499 MHz instead of 500 * MHz. In such cases it is better to avoid getting into * unnecessary frequency updates. */ if (abs(policy->cur - new_freq) < KHZ_PER_MHZ) return policy->cur; cpufreq_out_of_sync(policy, new_freq); if (update) schedule_work(&policy->update); } return new_freq; } /** * cpufreq_quick_get - get the CPU frequency (in kHz) from policy->cur * @cpu: CPU number * * This is the last known freq, without actually getting it from the driver. * Return value will be same as what is shown in scaling_cur_freq in sysfs. */ unsigned int cpufreq_quick_get(unsigned int cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; unsigned long flags; read_lock_irqsave(&cpufreq_driver_lock, flags); if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) { unsigned int ret_freq = cpufreq_driver->get(cpu); read_unlock_irqrestore(&cpufreq_driver_lock, flags); return ret_freq; } read_unlock_irqrestore(&cpufreq_driver_lock, flags); policy = cpufreq_cpu_get(cpu); if (policy) return policy->cur; return 0; } EXPORT_SYMBOL(cpufreq_quick_get); /** * cpufreq_quick_get_max - get the max reported CPU frequency for this CPU * @cpu: CPU number * * Just return the max possible frequency for a given CPU. */ unsigned int cpufreq_quick_get_max(unsigned int cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy); policy = cpufreq_cpu_get(cpu); if (policy) return policy->max; return 0; } EXPORT_SYMBOL(cpufreq_quick_get_max); /** * cpufreq_get_hw_max_freq - get the max hardware frequency of the CPU * @cpu: CPU number * * The default return value is the max_freq field of cpuinfo. */ __weak unsigned int cpufreq_get_hw_max_freq(unsigned int cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy); policy = cpufreq_cpu_get(cpu); if (policy) return policy->cpuinfo.max_freq; return 0; } EXPORT_SYMBOL(cpufreq_get_hw_max_freq); static unsigned int __cpufreq_get(struct cpufreq_policy *policy) { if (unlikely(policy_is_inactive(policy))) return 0; return cpufreq_verify_current_freq(policy, true); } /** * cpufreq_get - get the current CPU frequency (in kHz) * @cpu: CPU number * * Get the CPU current (static) CPU frequency */ unsigned int cpufreq_get(unsigned int cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy); policy = cpufreq_cpu_get(cpu); if (!policy) return 0; guard(cpufreq_policy_read)(policy); return __cpufreq_get(policy); } EXPORT_SYMBOL(cpufreq_get); static struct subsys_interface cpufreq_interface = { .name = "cpufreq", .subsys = &cpu_subsys, .add_dev = cpufreq_add_dev, .remove_dev = cpufreq_remove_dev, }; /* * In case platform wants some specific frequency to be configured * during suspend.. */ int cpufreq_generic_suspend(struct cpufreq_policy *policy) { int ret; if (!policy->suspend_freq) { pr_debug("%s: suspend_freq not defined\n", __func__); return 0; } pr_debug("%s: Setting suspend-freq: %u\n", __func__, policy->suspend_freq); ret = __cpufreq_driver_target(policy, policy->suspend_freq, CPUFREQ_RELATION_H); if (ret) pr_err("%s: unable to set suspend-freq: %u. err: %d\n", __func__, policy->suspend_freq, ret); return ret; } EXPORT_SYMBOL(cpufreq_generic_suspend); /** * cpufreq_suspend() - Suspend CPUFreq governors. * * Called during system wide Suspend/Hibernate cycles for suspending governors * as some platforms can't change frequency after this point in suspend cycle. * Because some of the devices (like: i2c, regulators, etc) they use for * changing frequency are suspended quickly after this point. */ void cpufreq_suspend(void) { struct cpufreq_policy *policy; if (!cpufreq_driver) return; if (!has_target() && !cpufreq_driver->suspend) goto suspend; pr_debug("%s: Suspending Governors\n", __func__); for_each_active_policy(policy) { if (has_target()) { scoped_guard(cpufreq_policy_write, policy) { cpufreq_stop_governor(policy); } } if (cpufreq_driver->suspend && cpufreq_driver->suspend(policy)) pr_err("%s: Failed to suspend driver: %s\n", __func__, cpufreq_driver->name); } suspend: cpufreq_suspended = true; } /** * cpufreq_resume() - Resume CPUFreq governors. * * Called during system wide Suspend/Hibernate cycle for resuming governors that * are suspended with cpufreq_suspend(). */ void cpufreq_resume(void) { struct cpufreq_policy *policy; int ret; if (!cpufreq_driver) return; if (unlikely(!cpufreq_suspended)) return; cpufreq_suspended = false; if (!has_target() && !cpufreq_driver->resume) return; pr_debug("%s: Resuming Governors\n", __func__); for_each_active_policy(policy) { if (cpufreq_driver->resume && cpufreq_driver->resume(policy)) { pr_err("%s: Failed to resume driver: %s\n", __func__, cpufreq_driver->name); } else if (has_target()) { scoped_guard(cpufreq_policy_write, policy) { ret = cpufreq_start_governor(policy); } if (ret) pr_err("%s: Failed to start governor for CPU%u's policy\n", __func__, policy->cpu); } } } /** * cpufreq_driver_test_flags - Test cpufreq driver's flags against given ones. * @flags: Flags to test against the current cpufreq driver's flags. * * Assumes that the driver is there, so callers must ensure that this is the * case. */ bool cpufreq_driver_test_flags(u16 flags) { return !!(cpufreq_driver->flags & flags); } /** * cpufreq_get_current_driver - Return the current driver's name. * * Return the name string of the currently registered cpufreq driver or NULL if * none. */ const char *cpufreq_get_current_driver(void) { if (cpufreq_driver) return cpufreq_driver->name; return NULL; } EXPORT_SYMBOL_GPL(cpufreq_get_current_driver); /** * cpufreq_get_driver_data - Return current driver data. * * Return the private data of the currently registered cpufreq driver, or NULL * if no cpufreq driver has been registered. */ void *cpufreq_get_driver_data(void) { if (cpufreq_driver) return cpufreq_driver->driver_data; return NULL; } EXPORT_SYMBOL_GPL(cpufreq_get_driver_data); /********************************************************************* * NOTIFIER LISTS INTERFACE * *********************************************************************/ /** * cpufreq_register_notifier - Register a notifier with cpufreq. * @nb: notifier function to register. * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER. * * Add a notifier to one of two lists: either a list of notifiers that run on * clock rate changes (once before and once after every transition), or a list * of notifiers that ron on cpufreq policy changes. * * This function may sleep and it has the same return values as * blocking_notifier_chain_register(). */ int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list) { int ret; if (cpufreq_disabled()) return -EINVAL; switch (list) { case CPUFREQ_TRANSITION_NOTIFIER: mutex_lock(&cpufreq_fast_switch_lock); if (cpufreq_fast_switch_count > 0) { mutex_unlock(&cpufreq_fast_switch_lock); return -EBUSY; } ret = srcu_notifier_chain_register( &cpufreq_transition_notifier_list, nb); if (!ret) cpufreq_fast_switch_count--; mutex_unlock(&cpufreq_fast_switch_lock); break; case CPUFREQ_POLICY_NOTIFIER: ret = blocking_notifier_chain_register( &cpufreq_policy_notifier_list, nb); break; default: ret = -EINVAL; } return ret; } EXPORT_SYMBOL(cpufreq_register_notifier); /** * cpufreq_unregister_notifier - Unregister a notifier from cpufreq. * @nb: notifier block to be unregistered. * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER. * * Remove a notifier from one of the cpufreq notifier lists. * * This function may sleep and it has the same return values as * blocking_notifier_chain_unregister(). */ int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list) { int ret; if (cpufreq_disabled()) return -EINVAL; switch (list) { case CPUFREQ_TRANSITION_NOTIFIER: mutex_lock(&cpufreq_fast_switch_lock); ret = srcu_notifier_chain_unregister( &cpufreq_transition_notifier_list, nb); if (!ret && !WARN_ON(cpufreq_fast_switch_count >= 0)) cpufreq_fast_switch_count++; mutex_unlock(&cpufreq_fast_switch_lock); break; case CPUFREQ_POLICY_NOTIFIER: ret = blocking_notifier_chain_unregister( &cpufreq_policy_notifier_list, nb); break; default: ret = -EINVAL; } return ret; } EXPORT_SYMBOL(cpufreq_unregister_notifier); /********************************************************************* * GOVERNORS * *********************************************************************/ /** * cpufreq_driver_fast_switch - Carry out a fast CPU frequency switch. * @policy: cpufreq policy to switch the frequency for. * @target_freq: New frequency to set (may be approximate). * * Carry out a fast frequency switch without sleeping. * * The driver's ->fast_switch() callback invoked by this function must be * suitable for being called from within RCU-sched read-side critical sections * and it is expected to select the minimum available frequency greater than or * equal to @target_freq (CPUFREQ_RELATION_L). * * This function must not be called if policy->fast_switch_enabled is unset. * * Governors calling this function must guarantee that it will never be invoked * twice in parallel for the same policy and that it will never be called in * parallel with either ->target() or ->target_index() for the same policy. * * Returns the actual frequency set for the CPU. * * If 0 is returned by the driver's ->fast_switch() callback to indicate an * error condition, the hardware configuration must be preserved. */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { unsigned int freq; int cpu; target_freq = clamp_val(target_freq, policy->min, policy->max); freq = cpufreq_driver->fast_switch(policy, target_freq); if (!freq) return 0; policy->cur = freq; arch_set_freq_scale(policy->related_cpus, freq, arch_scale_freq_ref(policy->cpu)); cpufreq_stats_record_transition(policy, freq); if (trace_cpu_frequency_enabled()) { for_each_cpu(cpu, policy->cpus) trace_cpu_frequency(freq, cpu); } return freq; } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); /** * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. * @cpu: Target CPU. * @min_perf: Minimum (required) performance level (units of @capacity). * @target_perf: Target (desired) performance level (units of @capacity). * @capacity: Capacity of the target CPU. * * Carry out a fast performance level switch of @cpu without sleeping. * * The driver's ->adjust_perf() callback invoked by this function must be * suitable for being called from within RCU-sched read-side critical sections * and it is expected to select a suitable performance level equal to or above * @min_perf and preferably equal to or below @target_perf. * * This function must not be called if policy->fast_switch_enabled is unset. * * Governors calling this function must guarantee that it will never be invoked * twice in parallel for the same CPU and that it will never be called in * parallel with either ->target() or ->target_index() or ->fast_switch() for * the same CPU. */ void cpufreq_driver_adjust_perf(unsigned int cpu, unsigned long min_perf, unsigned long target_perf, unsigned long capacity) { cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); } /** * cpufreq_driver_has_adjust_perf - Check "direct fast switch" callback. * * Return 'true' if the ->adjust_perf callback is present for the * current driver or 'false' otherwise. */ bool cpufreq_driver_has_adjust_perf(void) { return !!cpufreq_driver->adjust_perf; } /* Must set freqs->new to intermediate frequency */ static int __target_intermediate(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int index) { int ret; freqs->new = cpufreq_driver->get_intermediate(policy, index); /* We don't need to switch to intermediate freq */ if (!freqs->new) return 0; pr_debug("%s: cpu: %d, switching to intermediate freq: oldfreq: %u, intermediate freq: %u\n", __func__, policy->cpu, freqs->old, freqs->new); cpufreq_freq_transition_begin(policy, freqs); ret = cpufreq_driver->target_intermediate(policy, index); cpufreq_freq_transition_end(policy, freqs, ret); if (ret) pr_err("%s: Failed to change to intermediate frequency: %d\n", __func__, ret); return ret; } static int __target_index(struct cpufreq_policy *policy, int index) { struct cpufreq_freqs freqs = {.old = policy->cur, .flags = 0}; unsigned int restore_freq, intermediate_freq = 0; unsigned int newfreq = policy->freq_table[index].frequency; int retval = -EINVAL; bool notify; if (newfreq == policy->cur) return 0; /* Save last value to restore later on errors */ restore_freq = policy->cur; notify = !(cpufreq_driver->flags & CPUFREQ_ASYNC_NOTIFICATION); if (notify) { /* Handle switching to intermediate frequency */ if (cpufreq_driver->get_intermediate) { retval = __target_intermediate(policy, &freqs, index); if (retval) return retval; intermediate_freq = freqs.new; /* Set old freq to intermediate */ if (intermediate_freq) freqs.old = freqs.new; } freqs.new = newfreq; pr_debug("%s: cpu: %d, oldfreq: %u, new freq: %u\n", __func__, policy->cpu, freqs.old, freqs.new); cpufreq_freq_transition_begin(policy, &freqs); } retval = cpufreq_driver->target_index(policy, index); if (retval) pr_err("%s: Failed to change cpu frequency: %d\n", __func__, retval); if (notify) { cpufreq_freq_transition_end(policy, &freqs, retval); /* * Failed after setting to intermediate freq? Driver should have * reverted back to initial frequency and so should we. Check * here for intermediate_freq instead of get_intermediate, in * case we haven't switched to intermediate freq at all. */ if (unlikely(retval && intermediate_freq)) { freqs.old = intermediate_freq; freqs.new = restore_freq; cpufreq_freq_transition_begin(policy, &freqs); cpufreq_freq_transition_end(policy, &freqs, 0); } } return retval; } int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { unsigned int old_target_freq = target_freq; if (cpufreq_disabled()) return -ENODEV; target_freq = __resolve_freq(policy, target_freq, policy->min, policy->max, relation); pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n", policy->cpu, target_freq, relation, old_target_freq); /* * This might look like a redundant call as we are checking it again * after finding index. But it is left intentionally for cases where * exactly same freq is called again and so we can save on few function * calls. */ if (target_freq == policy->cur && !(cpufreq_driver->flags & CPUFREQ_NEED_UPDATE_LIMITS)) return 0; if (cpufreq_driver->target) { /* * If the driver hasn't setup a single inefficient frequency, * it's unlikely it knows how to decode CPUFREQ_RELATION_E. */ if (!policy->efficiencies_available) relation &= ~CPUFREQ_RELATION_E; return cpufreq_driver->target(policy, target_freq, relation); } if (!cpufreq_driver->target_index) return -EINVAL; return __target_index(policy, policy->cached_resolved_idx); } EXPORT_SYMBOL_GPL(__cpufreq_driver_target); int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { guard(cpufreq_policy_write)(policy); return __cpufreq_driver_target(policy, target_freq, relation); } EXPORT_SYMBOL_GPL(cpufreq_driver_target); __weak struct cpufreq_governor *cpufreq_fallback_governor(void) { return NULL; } static int cpufreq_init_governor(struct cpufreq_policy *policy) { int ret; /* Don't start any governor operations if we are entering suspend */ if (cpufreq_suspended) return 0; /* * Governor might not be initiated here if ACPI _PPC changed * notification happened, so check it. */ if (!policy->governor) return -EINVAL; /* Platform doesn't want dynamic frequency switching ? */ if (policy->governor->flags & CPUFREQ_GOV_DYNAMIC_SWITCHING && cpufreq_driver->flags & CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING) { struct cpufreq_governor *gov = cpufreq_fallback_governor(); if (gov) { pr_warn("Can't use %s governor as dynamic switching is disallowed. Fallback to %s governor\n", policy->governor->name, gov->name); policy->governor = gov; } else { return -EINVAL; } } if (!try_module_get(policy->governor->owner)) return -EINVAL; pr_debug("%s: for CPU %u\n", __func__, policy->cpu); if (policy->governor->init) { ret = policy->governor->init(policy); if (ret) { module_put(policy->governor->owner); return ret; } } policy->strict_target = !!(policy->governor->flags & CPUFREQ_GOV_STRICT_TARGET); return 0; } static void cpufreq_exit_governor(struct cpufreq_policy *policy) { if (cpufreq_suspended || !policy->governor) return; pr_debug("%s: for CPU %u\n", __func__, policy->cpu); if (policy->governor->exit) policy->governor->exit(policy); module_put(policy->governor->owner); } int cpufreq_start_governor(struct cpufreq_policy *policy) { int ret; if (cpufreq_suspended) return 0; if (!policy->governor) return -EINVAL; pr_debug("%s: for CPU %u\n", __func__, policy->cpu); cpufreq_verify_current_freq(policy, false); if (policy->governor->start) { ret = policy->governor->start(policy); if (ret) return ret; } if (policy->governor->limits) policy->governor->limits(policy); return 0; } void cpufreq_stop_governor(struct cpufreq_policy *policy) { if (cpufreq_suspended || !policy->governor) return; pr_debug("%s: for CPU %u\n", __func__, policy->cpu); if (policy->governor->stop) policy->governor->stop(policy); } static void cpufreq_governor_limits(struct cpufreq_policy *policy) { if (cpufreq_suspended || !policy->governor) return; pr_debug("%s: for CPU %u\n", __func__, policy->cpu); if (policy->governor->limits) policy->governor->limits(policy); } int cpufreq_register_governor(struct cpufreq_governor *governor) { int err; if (!governor) return -EINVAL; if (cpufreq_disabled()) return -ENODEV; mutex_lock(&cpufreq_governor_mutex); err = -EBUSY; if (!find_governor(governor->name)) { err = 0; list_add(&governor->governor_list, &cpufreq_governor_list); } mutex_unlock(&cpufreq_governor_mutex); return err; } EXPORT_SYMBOL_GPL(cpufreq_register_governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor) { struct cpufreq_policy *policy; unsigned long flags; if (!governor) return; if (cpufreq_disabled()) return; /* clear last_governor for all inactive policies */ read_lock_irqsave(&cpufreq_driver_lock, flags); for_each_inactive_policy(policy) { if (!strcmp(policy->last_governor, governor->name)) { policy->governor = NULL; strcpy(policy->last_governor, "\0"); } } read_unlock_irqrestore(&cpufreq_driver_lock, flags); mutex_lock(&cpufreq_governor_mutex); list_del(&governor->governor_list); mutex_unlock(&cpufreq_governor_mutex); } EXPORT_SYMBOL_GPL(cpufreq_unregister_governor); /********************************************************************* * POLICY INTERFACE * *********************************************************************/ DEFINE_PER_CPU(unsigned long, cpufreq_pressure); /** * cpufreq_update_pressure() - Update cpufreq pressure for CPUs * @policy: cpufreq policy of the CPUs. * * Update the value of cpufreq pressure for all @cpus in the policy. */ static void cpufreq_update_pressure(struct cpufreq_policy *policy) { unsigned long max_capacity, capped_freq, pressure; u32 max_freq; int cpu; cpu = cpumask_first(policy->related_cpus); max_freq = arch_scale_freq_ref(cpu); capped_freq = policy->max; /* * Handle properly the boost frequencies, which should simply clean * the cpufreq pressure value. */ if (max_freq <= capped_freq) { pressure = 0; } else { max_capacity = arch_scale_cpu_capacity(cpu); pressure = max_capacity - mult_frac(max_capacity, capped_freq, max_freq); } for_each_cpu(cpu, policy->related_cpus) WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure); } /** * cpufreq_set_policy - Modify cpufreq policy parameters. * @policy: Policy object to modify. * @new_gov: Policy governor pointer. * @new_pol: Policy value (for drivers with built-in governors). * * Invoke the cpufreq driver's ->verify() callback to sanity-check the frequency * limits to be set for the policy, update @policy with the verified limits * values and either invoke the driver's ->setpolicy() callback (if present) or * carry out a governor update for @policy. That is, run the current governor's * ->limits() callback (if @new_gov points to the same object as the one in * @policy) or replace the governor for @policy with @new_gov. * * The cpuinfo part of @policy is not updated by this function. */ static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_governor *new_gov, unsigned int new_pol) { struct cpufreq_policy_data new_data; struct cpufreq_governor *old_gov; int ret; memcpy(&new_data.cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo)); new_data.freq_table = policy->freq_table; new_data.cpu = policy->cpu; /* * PM QoS framework collects all the requests from users and provide us * the final aggregated value here. */ new_data.min = freq_qos_read_value(&policy->constraints, FREQ_QOS_MIN); new_data.max = freq_qos_read_value(&policy->constraints, FREQ_QOS_MAX); pr_debug("setting new policy for CPU %u: %u - %u kHz\n", new_data.cpu, new_data.min, new_data.max); /* * Verify that the CPU speed can be set within these limits and make sure * that min <= max. */ ret = cpufreq_driver->verify(&new_data); if (ret) return ret; /* * Resolve policy min/max to available frequencies. It ensures * no frequency resolution will neither overshoot the requested maximum * nor undershoot the requested minimum. * * Avoid storing intermediate values in policy->max or policy->min and * compiler optimizations around them because they may be accessed * concurrently by cpufreq_driver_resolve_freq() during the update. */ WRITE_ONCE(policy->max, __resolve_freq(policy, new_data.max, new_data.min, new_data.max, CPUFREQ_RELATION_H)); new_data.min = __resolve_freq(policy, new_data.min, new_data.min, new_data.max, CPUFREQ_RELATION_L); WRITE_ONCE(policy->min, new_data.min > policy->max ? policy->max : new_data.min); trace_cpu_frequency_limits(policy); cpufreq_update_pressure(policy); policy->cached_target_freq = UINT_MAX; pr_debug("new min and max freqs are %u - %u kHz\n", policy->min, policy->max); if (cpufreq_driver->setpolicy) { policy->policy = new_pol; pr_debug("setting range\n"); return cpufreq_driver->setpolicy(policy); } if (new_gov == policy->governor) { pr_debug("governor limits update\n"); cpufreq_governor_limits(policy); return 0; } pr_debug("governor switch\n"); /* save old, working values */ old_gov = policy->governor; /* end old governor */ if (old_gov) { cpufreq_stop_governor(policy); cpufreq_exit_governor(policy); } /* start new governor */ policy->governor = new_gov; ret = cpufreq_init_governor(policy); if (!ret) { ret = cpufreq_start_governor(policy); if (!ret) { pr_debug("governor change\n"); return 0; } cpufreq_exit_governor(policy); } /* new governor failed, so re-start old one */ pr_debug("starting governor %s failed\n", policy->governor->name); if (old_gov) { policy->governor = old_gov; if (cpufreq_init_governor(policy)) { policy->governor = NULL; } else if (cpufreq_start_governor(policy)) { cpufreq_exit_governor(policy); policy->governor = NULL; } } return ret; } static void cpufreq_policy_refresh(struct cpufreq_policy *policy) { guard(cpufreq_policy_write)(policy); /* * BIOS might change freq behind our back * -> ask driver for current freq and notify governors about a change */ if (cpufreq_driver->get && has_target() && (cpufreq_suspended || WARN_ON(!cpufreq_verify_current_freq(policy, false)))) return; refresh_frequency_limits(policy); } /** * cpufreq_update_policy - Re-evaluate an existing cpufreq policy. * @cpu: CPU to re-evaluate the policy for. * * Update the current frequency for the cpufreq policy of @cpu and use * cpufreq_set_policy() to re-apply the min and max limits, which triggers the * evaluation of policy notifiers and the cpufreq driver's ->verify() callback * for the policy in question, among other things. */ void cpufreq_update_policy(unsigned int cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy); policy = cpufreq_cpu_get(cpu); if (!policy) return; cpufreq_policy_refresh(policy); } EXPORT_SYMBOL(cpufreq_update_policy); /** * cpufreq_update_limits - Update policy limits for a given CPU. * @cpu: CPU to update the policy limits for. * * Invoke the driver's ->update_limits callback if present or call * cpufreq_policy_refresh() for @cpu. */ void cpufreq_update_limits(unsigned int cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy); policy = cpufreq_cpu_get(cpu); if (!policy) return; if (cpufreq_driver->update_limits) cpufreq_driver->update_limits(policy); else cpufreq_policy_refresh(policy); } EXPORT_SYMBOL_GPL(cpufreq_update_limits); /********************************************************************* * BOOST * *********************************************************************/ int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { int ret; if (!policy->freq_table) return -ENXIO; ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table); if (ret) { pr_err("%s: Policy frequency update failed\n", __func__); return ret; } ret = freq_qos_update_request(policy->max_freq_req, policy->max); if (ret < 0) return ret; return 0; } EXPORT_SYMBOL_GPL(cpufreq_boost_set_sw); static int cpufreq_boost_trigger_state(int state) { struct cpufreq_policy *policy; unsigned long flags; int ret = 0; /* * Don't compare 'cpufreq_driver->boost_enabled' with 'state' here to * make sure all policies are in sync with global boost flag. */ write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver->boost_enabled = state; write_unlock_irqrestore(&cpufreq_driver_lock, flags); cpus_read_lock(); for_each_active_policy(policy) { if (!policy->boost_supported) continue; ret = policy_set_boost(policy, state); if (ret) goto err_reset_state; } cpus_read_unlock(); return 0; err_reset_state: cpus_read_unlock(); write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver->boost_enabled = !state; write_unlock_irqrestore(&cpufreq_driver_lock, flags); pr_err("%s: Cannot %s BOOST\n", __func__, str_enable_disable(state)); return ret; } static bool cpufreq_boost_supported(void) { return cpufreq_driver->set_boost; } static int create_boost_sysfs_file(void) { int ret; ret = sysfs_create_file(cpufreq_global_kobject, &boost.attr); if (ret) pr_err("%s: cannot register global BOOST sysfs file\n", __func__); return ret; } static void remove_boost_sysfs_file(void) { if (cpufreq_boost_supported()) sysfs_remove_file(cpufreq_global_kobject, &boost.attr); } bool cpufreq_boost_enabled(void) { return cpufreq_driver->boost_enabled; } EXPORT_SYMBOL_GPL(cpufreq_boost_enabled); /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * *********************************************************************/ static enum cpuhp_state hp_online; static int cpuhp_cpufreq_online(unsigned int cpu) { cpufreq_online(cpu); return 0; } static int cpuhp_cpufreq_offline(unsigned int cpu) { cpufreq_offline(cpu); return 0; } /** * cpufreq_register_driver - register a CPU Frequency driver * @driver_data: A struct cpufreq_driver containing the values# * submitted by the CPU Frequency driver. * * Registers a CPU Frequency driver to this core code. This code * returns zero on success, -EEXIST when another driver got here first * (and isn't unregistered in the meantime). * */ int cpufreq_register_driver(struct cpufreq_driver *driver_data) { unsigned long flags; int ret; if (cpufreq_disabled()) return -ENODEV; /* * The cpufreq core depends heavily on the availability of device * structure, make sure they are available before proceeding further. */ if (!get_cpu_device(0)) return -EPROBE_DEFER; if (!driver_data || !driver_data->verify || !driver_data->init || !(driver_data->setpolicy || driver_data->target_index || driver_data->target) || (driver_data->setpolicy && (driver_data->target_index || driver_data->target)) || (!driver_data->get_intermediate != !driver_data->target_intermediate) || (!driver_data->online != !driver_data->offline) || (driver_data->adjust_perf && !driver_data->fast_switch)) return -EINVAL; pr_debug("trying to register driver %s\n", driver_data->name); /* Protect against concurrent CPU online/offline. */ cpus_read_lock(); write_lock_irqsave(&cpufreq_driver_lock, flags); if (cpufreq_driver) { write_unlock_irqrestore(&cpufreq_driver_lock, flags); ret = -EEXIST; goto out; } cpufreq_driver = driver_data; write_unlock_irqrestore(&cpufreq_driver_lock, flags); if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; if (cpufreq_boost_supported()) { ret = create_boost_sysfs_file(); if (ret) goto err_null_driver; } ret = subsys_interface_register(&cpufreq_interface); if (ret) goto err_boost_unreg; if (unlikely(list_empty(&cpufreq_policy_list))) { /* if all ->init() calls failed, unregister */ ret = -ENODEV; pr_debug("%s: No CPU initialized for driver %s\n", __func__, driver_data->name); goto err_if_unreg; } ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "cpufreq:online", cpuhp_cpufreq_online, cpuhp_cpufreq_offline); if (ret < 0) goto err_if_unreg; hp_online = ret; ret = 0; /* * Mark support for the scheduler's frequency invariance engine for * drivers that implement target(), target_index() or fast_switch(). */ if (!cpufreq_driver->setpolicy) { static_branch_enable_cpuslocked(&cpufreq_freq_invariance); pr_debug("supports frequency invariance"); } pr_debug("driver %s up and running\n", driver_data->name); goto out; err_if_unreg: subsys_interface_unregister(&cpufreq_interface); err_boost_unreg: remove_boost_sysfs_file(); err_null_driver: write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); out: cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(cpufreq_register_driver); /* * cpufreq_unregister_driver - unregister the current CPUFreq driver * * Unregister the current CPUFreq driver. Only call this if you have * the right to do so, i.e. if you have succeeded in initialising before! * Returns zero if successful, and -EINVAL if the cpufreq_driver is * currently not initialised. */ void cpufreq_unregister_driver(struct cpufreq_driver *driver) { unsigned long flags; if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) return; pr_debug("unregistering driver %s\n", driver->name); /* Protect against concurrent cpu hotplug */ cpus_read_lock(); subsys_interface_unregister(&cpufreq_interface); remove_boost_sysfs_file(); static_branch_disable_cpuslocked(&cpufreq_freq_invariance); cpuhp_remove_state_nocalls_cpuslocked(hp_online); write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); static int __init cpufreq_core_init(void) { struct cpufreq_governor *gov = cpufreq_default_governor(); struct device *dev_root; if (cpufreq_disabled()) return -ENODEV; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { cpufreq_global_kobject = kobject_create_and_add("cpufreq", &dev_root->kobj); put_device(dev_root); } BUG_ON(!cpufreq_global_kobject); if (!strlen(default_governor)) strscpy(default_governor, gov->name, CPUFREQ_NAME_LEN); return 0; } static bool cpufreq_policy_is_good_for_eas(unsigned int cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy); policy = cpufreq_cpu_get(cpu); if (!policy) { pr_debug("cpufreq policy not set for CPU: %d\n", cpu); return false; } return sugov_is_governor(policy); } bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask) { unsigned int cpu; /* Do not attempt EAS if schedutil is not being used. */ for_each_cpu(cpu, cpu_mask) { if (!cpufreq_policy_is_good_for_eas(cpu)) { pr_debug("rd %*pbl: schedutil is mandatory for EAS\n", cpumask_pr_args(cpu_mask)); return false; } } return true; } module_param(off, int, 0444); module_param_string(default_governor, default_governor, CPUFREQ_NAME_LEN, 0444); core_initcall(cpufreq_core_init); |
| 112 112 111 5 5 5 5 5 5 5 222 410 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp stream message interleaving, mostly * including I-DATA and I-FORWARD-TSN chunks process. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <net/busy_poll.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/ulpevent.h> #include <linux/sctp.h> static struct sctp_chunk *sctp_make_idatafrag_empty( const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_idatahdr dp; memset(&dp, 0, sizeof(dp)); dp.stream = htons(sinfo->sinfo_stream); if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; __u32 cfsn = 0; __u16 sid; if (chunk->has_mid) return; sid = sctp_chunk_stream_no(chunk); stream = &chunk->asoc->stream; list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { struct sctp_idatahdr *hdr; __u32 mid; lchunk->has_mid = 1; hdr = lchunk->subh.idata_hdr; if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) hdr->ppid = lchunk->sinfo.sinfo_ppid; else hdr->fsn = htonl(cfsn++); if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_uo_next(stream, out, sid) : sctp_mid_uo_peek(stream, out, sid); } else { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_next(stream, out, sid) : sctp_mid_peek(stream, out, sid); } hdr->mid = htonl(mid); } } static bool sctp_validate_data(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u16 sid, ssn; if (chunk->chunk_hdr->type != SCTP_CID_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); ssn = ntohs(chunk->subh.data_hdr->ssn); return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); } static bool sctp_validate_idata(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u32 mid; __u16 sid; if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); mid = ntohl(chunk->subh.idata_hdr->mid); return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); } static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) { loc = pos; break; } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); else __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream || cevent->mid != sin->mid) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (cevent->mid == sin->mid) { pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled(ulpq, event); return retval; } static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } cevent = (struct sctp_ulpevent *)pos->cb; if (event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if (event->stream > cevent->stream) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > event->stream) { loc = pos; break; } if (cevent->stream == event->stream && MID_lt(event->mid, cevent->mid)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); else __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sctp_stream *stream; struct sk_buff *pos, *tmp; __u16 sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > sid) break; if (cevent->stream < sid) continue; if (cevent->mid != sctp_mid_peek(stream, in, sid)) break; sctp_mid_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); __skb_queue_tail(event_list, pos); } } static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_stream *stream; __u16 sid; stream = &ulpq->asoc->stream; sid = event->stream; if (event->mid != sctp_mid_peek(stream, in, sid)) { sctp_intl_store_ordered(ulpq, event); return NULL; } sctp_mid_next(stream, in, sid); sctp_intl_retrieve_ordered(ulpq, event); return event; } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff *skb; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos; pos = skb_peek_tail(&ulpq->reasm_uo); if (!pos) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } skb_queue_walk(&ulpq->reasm_uo, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) break; if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) break; } __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, sin->mid_uo)) continue; if (MID_lt(sin->mid_uo, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode_uo = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!sin->pd_mode_uo) { sin->mid_uo = cevent->mid; pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm_uo(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; sin->mid_uo = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid_uo && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } return retval; } static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; int event_eor = 0; event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->mid = ntohl(chunk->subh.idata_hdr->mid); if (event->msg_flags & SCTP_DATA_FIRST_FRAG) event->ppid = chunk->subh.idata_hdr->ppid; else event->fsn = ntohl(chunk->subh.idata_hdr->fsn); if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_enqueue_event(ulpq, &temp); } return event_eor; } static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; if (cevent->mid == csin->mid) { first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; } break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } return retval; } static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } } static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, needed); } if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); } static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u16 flags, gfp_t gfp) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { struct sctp_sock *sp = sctp_sk(sk); __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } } static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sctp_stream *stream = &ulpq->asoc->stream; struct sctp_ulpevent *cevent, *event = NULL; struct sk_buff_head *lobby = &ulpq->lobby; struct sk_buff *pos, *tmp; struct sk_buff_head temp; __u16 csid; __u32 cmid; skb_queue_head_init(&temp); sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid > sid) break; if (csid < sid) continue; if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) event = sctp_skb2event(pos); __skb_queue_tail(&temp, pos); } if (!event && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { sctp_mid_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } if (event) { sctp_intl_retrieve_ordered(ulpq, event); sctp_enqueue_event(ulpq, &temp); } } static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_stream *stream = &ulpq->asoc->stream; __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { sin->pd_mode_uo = 0; mid = sin->mid_uo; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); } if (sin->pd_mode) { sin->pd_mode = 0; mid = sin->mid; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } } /* intl abort pd happens only when all data needs to be cleaned */ sctp_ulpq_flush(ulpq); } static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, int nskips, __be16 stream, __u8 flags) { int i; for (i = 0; i < nskips; i++) if (skiplist[i].stream == stream && skiplist[i].flags == flags) return i; return i; } #define SCTP_FTSN_U_BIT 0x1 static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct list_head *lchunk, *temp; int nskips = 0, skip_pos; struct sctp_chunk *chunk; __u32 tsn; if (!asoc->peer.prsctp_capable) return; if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { __be16 sid = chunk->subh.idata_hdr->stream; __be32 mid = chunk->subh.idata_hdr->mid; __u8 flags = 0; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) flags |= SCTP_FTSN_U_BIT; asoc->adv_peer_ack_point = tsn; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, sid, flags); ftsn_skip_arr[skip_pos].stream = sid; ftsn_skip_arr[skip_pos].reserved = 0; ftsn_skip_arr[skip_pos].flags = flags; ftsn_skip_arr[skip_pos].mid = mid; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else { break; } } if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \ sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ sizeof(struct sctp_ifwdtsn_chunk)) static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_fwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static bool sctp_validate_iftsn(struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_ifwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_ulpq_reasm_flushtsn(ulpq, ftsn); /* Abort any in progress partial delivery. */ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { struct sk_buff *pos, *tmp; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } } skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm_uo); sctp_ulpevent_free(event); } } } static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_intl_reasm_flushtsn(ulpq, ftsn); /* abort only when it's for all data */ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) sctp_intl_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); } static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { sin->pd_mode_uo = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC); } return; } if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) return; if (sin->pd_mode) { sin->pd_mode = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); } sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; /* Walk through all the skipped MIDs and abort stream pd if possible */ sctp_walk_ifwdtsn(skip, chunk) sctp_intl_skip(ulpq, ntohs(skip->stream), ntohl(skip->mid), skip->flags); } static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_ulpq_tail_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), /* DATA process functions */ .make_datafrag = sctp_make_datafrag_empty, .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, /* FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_fwdtsn, .validate_ftsn = sctp_validate_fwdtsn, .report_ftsn = sctp_report_fwdtsn, .handle_ftsn = sctp_handle_fwdtsn, }; static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_enqueue_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), /* I-DATA process functions */ .make_datafrag = sctp_make_idatafrag_empty, .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, /* I-FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_iftsn, .validate_ftsn = sctp_validate_iftsn, .report_ftsn = sctp_report_iftsn, .handle_ftsn = sctp_handle_iftsn, }; void sctp_stream_interleave_init(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 : &sctp_stream_interleave_0; } |
| 5 5 1 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0-or-later /* * connector.c * * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net> * All rights reserved. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <linux/moduleparam.h> #include <linux/connector.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector."); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_CONNECTOR); static struct cn_dev cdev; static int cn_already_initialized; /* * Sends mult (multiple) cn_msg at a time. * * msg->seq and msg->ack are used to determine message genealogy. * When someone sends message it puts there locally unique sequence * and random acknowledge numbers. Sequence number may be copied into * nlmsghdr->nlmsg_seq too. * * Sequence number is incremented with each message to be sent. * * If we expect a reply to our message then the sequence number in * received message MUST be the same as in original message, and * acknowledge number MUST be the same + 1. * * If we receive a message and its sequence number is not equal to the * one we are expecting then it is a new message. * * If we receive a message and its sequence number is the same as one * we are expecting but it's acknowledgement number is not equal to * the acknowledgement number in the original message + 1, then it is * a new message. * * If msg->len != len, then additional cn_msg messages are expected following * the first msg. * * The message is sent to, the portid if given, the group if given, both if * both, or if both are zero then the group is looked up and sent there. */ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, gfp_t gfp_mask, netlink_filter_fn filter, void *filter_data) { struct cn_callback_entry *__cbq; unsigned int size; struct sk_buff *skb; struct nlmsghdr *nlh; struct cn_msg *data; struct cn_dev *dev = &cdev; u32 group = 0; int found = 0; if (portid || __group) { group = __group; } else { spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(__cbq, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&__cbq->id.id, &msg->id)) { found = 1; group = __cbq->group; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (!found) return -ENODEV; } if (!portid && !netlink_has_listeners(dev->nls, group)) return -ESRCH; size = sizeof(*msg) + len; skb = nlmsg_new(size, gfp_mask); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, msg->seq, NLMSG_DONE, size, 0); if (!nlh) { kfree_skb(skb); return -EMSGSIZE; } data = nlmsg_data(nlh); memcpy(data, msg, size); NETLINK_CB(skb).dst_group = group; if (group) return netlink_broadcast_filtered(dev->nls, skb, portid, group, gfp_mask, filter, (void *)filter_data); return netlink_unicast(dev->nls, skb, portid, !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); /* same as cn_netlink_send_mult except msg->len is used for len */ int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, gfp_t gfp_mask) { return cn_netlink_send_mult(msg, msg->len, portid, __group, gfp_mask, NULL, NULL); } EXPORT_SYMBOL_GPL(cn_netlink_send); /* * Callback helper - queues work and setup destructor for given data. */ static int cn_call_callback(struct sk_buff *skb) { struct nlmsghdr *nlh; struct cn_callback_entry *i, *cbq = NULL; struct cn_dev *dev = &cdev; struct cn_msg *msg = nlmsg_data(nlmsg_hdr(skb)); struct netlink_skb_parms *nsp = &NETLINK_CB(skb); int err = -ENODEV; /* verify msg->len is within skb */ nlh = nlmsg_hdr(skb); if (nlh->nlmsg_len < NLMSG_HDRLEN + sizeof(struct cn_msg) + msg->len) return -EINVAL; spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(i, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&i->id.id, &msg->id)) { refcount_inc(&i->refcnt); cbq = i; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (cbq != NULL) { cbq->callback(msg, nsp); kfree_skb(skb); cn_queue_release_callback(cbq); err = 0; } return err; } /* * Allow non-root access for NETLINK_CONNECTOR family having CN_IDX_PROC * multicast group. */ static int cn_bind(struct net *net, int group) { unsigned long groups = (unsigned long) group; if (ns_capable(net->user_ns, CAP_NET_ADMIN)) return 0; if (test_bit(CN_IDX_PROC - 1, &groups)) return 0; return -EPERM; } static void cn_release(struct sock *sk, unsigned long *groups) { if (groups && test_bit(CN_IDX_PROC - 1, groups)) { kfree(sk->sk_user_data); sk->sk_user_data = NULL; } } /* * Main netlink receiving function. * * It checks skb, netlink header and msg sizes, and calls callback helper. */ static void cn_rx_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; int len, err; if (skb->len >= NLMSG_HDRLEN) { nlh = nlmsg_hdr(skb); len = nlmsg_len(nlh); if (len < (int)sizeof(struct cn_msg) || skb->len < nlh->nlmsg_len || len > CONNECTOR_MAX_MSG_SIZE) return; err = cn_call_callback(skb_get(skb)); if (err < 0) kfree_skb(skb); } } /* * Callback add routing - adds callback with given ID and name. * If there is registered callback with the same ID it will not be added. * * May sleep. */ int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { struct cn_dev *dev = &cdev; if (!cn_already_initialized) return -EAGAIN; return cn_queue_add_callback(dev->cbdev, name, id, callback); } EXPORT_SYMBOL_GPL(cn_add_callback); /* * Callback remove routing - removes callback * with given ID. * If there is no registered callback with given * ID nothing happens. * * May sleep while waiting for reference counter to become zero. */ void cn_del_callback(const struct cb_id *id) { struct cn_dev *dev = &cdev; cn_queue_del_callback(dev->cbdev, id); } EXPORT_SYMBOL_GPL(cn_del_callback); static int __maybe_unused cn_proc_show(struct seq_file *m, void *v) { struct cn_queue_dev *dev = cdev.cbdev; struct cn_callback_entry *cbq; seq_printf(m, "Name ID\n"); spin_lock_bh(&dev->queue_lock); list_for_each_entry(cbq, &dev->queue_list, callback_entry) { seq_printf(m, "%-15s %u:%u\n", cbq->id.name, cbq->id.id.idx, cbq->id.id.val); } spin_unlock_bh(&dev->queue_lock); return 0; } static int cn_init(void) { struct cn_dev *dev = &cdev; struct netlink_kernel_cfg cfg = { .groups = CN_NETLINK_USERS + 0xf, .input = cn_rx_skb, .flags = NL_CFG_F_NONROOT_RECV, .bind = cn_bind, .release = cn_release, }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg); if (!dev->nls) return -EIO; dev->cbdev = cn_queue_alloc_dev("cqueue", dev->nls); if (!dev->cbdev) { netlink_kernel_release(dev->nls); return -EINVAL; } cn_already_initialized = 1; proc_create_single("connector", S_IRUGO, init_net.proc_net, cn_proc_show); return 0; } static void cn_fini(void) { struct cn_dev *dev = &cdev; cn_already_initialized = 0; remove_proc_entry("connector", init_net.proc_net); cn_queue_free_dev(dev->cbdev); netlink_kernel_release(dev->nls); } subsys_initcall(cn_init); module_exit(cn_fini); |
| 30 22 32 32 12 12 12 6 6 29 29 29 37 3 37 18 18 16 18 12 11 11 7 4 9 2 2 8 4 8 2 2 2 2 1 7 12 12 12 8 8 26 26 23 22 22 20 15 18 3 2 12 11 1 10 2 10 10 10 12 11 12 12 15 18 8 11 6 5 4 1 4 3 4 8 11 2 3 2 2 3 8 7 5 3 2 5 7 8 5 4 3 2 3 5 22 22 10 8 7 7 7 6 6 8 22 3 2 1 6 5 4 3 2 9 8 7 6 6 3 2 5 4 6 9 7 5 6 1 1 4 2 4 2 3 3 2 3 3 7 37 37 37 14 11 13 14 12 36 37 11 2 2 1 2 33 4 33 2 2 2 1 1 2 2 2 1 33 32 12 12 12 7 12 37 1 1 37 37 36 36 8 29 29 29 1 10 29 29 20 20 5 5 30 29 45 42 39 41 19 17 16 16 15 15 12 2 26 6 37 45 15 15 15 15 15 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 | // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f; int nfilter; spin_lock_bh(&jsk->filters_lock); f = jsk->filters; nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; goto filter_match_found; } spin_unlock_bh(&jsk->filters_lock); return false; filter_match_found: spin_unlock_bh(&jsk->filters_lock); return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; enum skb_drop_reason reason; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0) sk_skb_reason_drop(&jsk->sk, skb, reason); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } read_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data) + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority)); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; u32 tsflags; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } while (todo_size); switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } read_unlock_bh(&priv->j1939_socks_lock); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, }; |
| 11 11 19 338 340 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> #include <asm/cpufeature.h> #include <asm/msr.h> /* * Define bits that are always set to 1 in DR7, only bit 10 is * architecturally reserved to '1'. * * This is also the init/reset value for DR7. */ #define DR7_FIXED_1 0x00000400 DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ #define get_debugreg(var, register) \ (var) = native_get_debugreg(register) #define set_debugreg(value, register) \ native_set_debugreg(register, value) #endif static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val; switch (regno) { case 0: asm("mov %%db0, %0" :"=r" (val)); break; case 1: asm("mov %%db1, %0" :"=r" (val)); break; case 2: asm("mov %%db2, %0" :"=r" (val)); break; case 3: asm("mov %%db3, %0" :"=r" (val)); break; case 6: asm("mov %%db6, %0" :"=r" (val)); break; case 7: /* * Use "asm volatile" for DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception * when running under SEV-ES. Taking a #VC exception is not a * safe thing to do just anywhere in the entry code and * re-ordering might place the access into an unsafe location. * * This happened in the NMI handler, where the DR7 read was * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ asm volatile("mov %%db7, %0" : "=r" (val)); break; default: BUG(); } return val; } static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: asm("mov %0, %%db0" ::"r" (value)); break; case 1: asm("mov %0, %%db1" ::"r" (value)); break; case 2: asm("mov %0, %%db2" ::"r" (value)); break; case 3: asm("mov %0, %%db3" ::"r" (value)); break; case 6: asm("mov %0, %%db6" ::"r" (value)); break; case 7: /* * Use "asm volatile" for DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the * "asm volatile" here too to avoid similar problems in the * future. */ asm volatile("mov %0, %%db7" ::"r" (value)); break; default: BUG(); } } static inline void hw_breakpoint_disable(void) { /* Reset the control register for HW Breakpoint */ set_debugreg(DR7_FIXED_1, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); set_debugreg(0UL, 1); set_debugreg(0UL, 2); set_debugreg(0UL, 3); } static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void hw_breakpoint_restore(void); static __always_inline unsigned long local_db_save(void) { unsigned long dr7; if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) return 0; get_debugreg(dr7, 7); /* Architecturally set bit */ dr7 &= ~DR7_FIXED_1; if (dr7) set_debugreg(DR7_FIXED_1, 7); /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not * be good. */ barrier(); return dr7; } static __always_inline void local_db_restore(unsigned long dr7) { /* * Ensure the compiler doesn't raise this statement into * the critical section; enabling breakpoints early would * not be good. */ barrier(); if (dr7) set_debugreg(dr7, 7); } #ifdef CONFIG_CPU_SUP_AMD extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr); extern unsigned long amd_get_dr_addr_mask(unsigned int dr); #else static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { } static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) { return 0; } #endif static inline unsigned long get_debugctlmsr(void) { unsigned long debugctlmsr = 0; #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return 0; #endif rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); return debugctlmsr; } static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return; #endif wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } #endif /* _ASM_X86_DEBUGREG_H */ |
| 455 5 3 2 1 1 1 1 5 3 2 1 1 1 455 453 455 63 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/bits.h> #include <linux/bitfield.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/xarray.h> #include <net/devlink.h> #include <net/net_shaper.h> #include "shaper_nl_gen.h" #include "../core/dev.h" #define NET_SHAPER_SCOPE_SHIFT 26 #define NET_SHAPER_ID_MASK GENMASK(NET_SHAPER_SCOPE_SHIFT - 1, 0) #define NET_SHAPER_SCOPE_MASK GENMASK(31, NET_SHAPER_SCOPE_SHIFT) #define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK struct net_shaper_hierarchy { struct xarray shapers; }; struct net_shaper_nl_ctx { struct net_shaper_binding binding; netdevice_tracker dev_tracker; unsigned long start_index; }; static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx) { return &((struct net_shaper_nl_ctx *)ctx)->binding; } static void net_shaper_lock(struct net_shaper_binding *binding) { switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: netdev_lock(binding->netdev); break; } } static void net_shaper_unlock(struct net_shaper_binding *binding) { switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: netdev_unlock(binding->netdev); break; } } static struct net_shaper_hierarchy * net_shaper_hierarchy(struct net_shaper_binding *binding) { /* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */ if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) return READ_ONCE(binding->netdev->net_shaper_hierarchy); /* No other type supported yet. */ return NULL; } static const struct net_shaper_ops * net_shaper_ops(struct net_shaper_binding *binding) { if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) return binding->netdev->netdev_ops->net_shaper_ops; /* No other type supported yet. */ return NULL; } /* Count the number of [multi] attributes of the given type. */ static int net_shaper_list_len(struct genl_info *info, int type) { struct nlattr *attr; int rem, cnt = 0; nla_for_each_attr_type(attr, type, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) cnt++; return cnt; } static int net_shaper_handle_size(void) { return nla_total_size(nla_total_size(sizeof(u32)) + nla_total_size(sizeof(u32))); } static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) { /* Should never happen, as currently only NETDEV is supported. */ if (WARN_ON_ONCE(binding->type != NET_SHAPER_BINDING_TYPE_NETDEV)) return -EINVAL; if (nla_put_u32(msg, type, binding->netdev->ifindex)) return -EMSGSIZE; return 0; } static int net_shaper_fill_handle(struct sk_buff *msg, const struct net_shaper_handle *handle, u32 type) { struct nlattr *handle_attr; if (handle->scope == NET_SHAPER_SCOPE_UNSPEC) return 0; handle_attr = nla_nest_start(msg, type); if (!handle_attr) return -EMSGSIZE; if (nla_put_u32(msg, NET_SHAPER_A_HANDLE_SCOPE, handle->scope) || (handle->scope >= NET_SHAPER_SCOPE_QUEUE && nla_put_u32(msg, NET_SHAPER_A_HANDLE_ID, handle->id))) goto handle_nest_cancel; nla_nest_end(msg, handle_attr); return 0; handle_nest_cancel: nla_nest_cancel(msg, handle_attr); return -EMSGSIZE; } static int net_shaper_fill_one(struct sk_buff *msg, const struct net_shaper_binding *binding, const struct net_shaper *shaper, const struct genl_info *info) { void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) return -EMSGSIZE; if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || net_shaper_fill_handle(msg, &shaper->parent, NET_SHAPER_A_PARENT) || net_shaper_fill_handle(msg, &shaper->handle, NET_SHAPER_A_HANDLE) || ((shaper->bw_min || shaper->bw_max || shaper->burst) && nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) || (shaper->bw_min && nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) || (shaper->bw_max && nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) || (shaper->burst && nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) || (shaper->priority && nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) || (shaper->weight && nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /* Initialize the context fetching the relevant device and * acquiring a reference to it. */ static int net_shaper_ctx_setup(const struct genl_info *info, int type, struct net_shaper_nl_ctx *ctx) { struct net *ns = genl_info_net(info); struct net_device *dev; int ifindex; if (GENL_REQ_ATTR_CHECK(info, type)) return -EINVAL; ifindex = nla_get_u32(info->attrs[type]); dev = netdev_get_by_index(ns, ifindex, &ctx->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_BAD_ATTR(info->extack, info->attrs[type]); return -ENOENT; } if (!dev->netdev_ops->net_shaper_ops) { NL_SET_BAD_ATTR(info->extack, info->attrs[type]); netdev_put(dev, &ctx->dev_tracker); return -EOPNOTSUPP; } ctx->binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; ctx->binding.netdev = dev; return 0; } static void net_shaper_ctx_cleanup(struct net_shaper_nl_ctx *ctx) { if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV) netdev_put(ctx->binding.netdev, &ctx->dev_tracker); } static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle) { return FIELD_PREP(NET_SHAPER_SCOPE_MASK, handle->scope) | FIELD_PREP(NET_SHAPER_ID_MASK, handle->id); } static void net_shaper_index_to_handle(u32 index, struct net_shaper_handle *handle) { handle->scope = FIELD_GET(NET_SHAPER_SCOPE_MASK, index); handle->id = FIELD_GET(NET_SHAPER_ID_MASK, index); } static void net_shaper_default_parent(const struct net_shaper_handle *handle, struct net_shaper_handle *parent) { switch (handle->scope) { case NET_SHAPER_SCOPE_UNSPEC: case NET_SHAPER_SCOPE_NETDEV: case __NET_SHAPER_SCOPE_MAX: parent->scope = NET_SHAPER_SCOPE_UNSPEC; break; case NET_SHAPER_SCOPE_QUEUE: case NET_SHAPER_SCOPE_NODE: parent->scope = NET_SHAPER_SCOPE_NETDEV; break; } parent->id = 0; } /* * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as * it's cleared by xa_store(). */ #define NET_SHAPER_NOT_VALID XA_MARK_1 static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); u32 index = net_shaper_handle_to_index(handle); if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID)) return NULL; return xa_load(&hierarchy->shapers, index); } /* Allocate on demand the per device shaper's hierarchy container. * Called under the net shaper lock */ static struct net_shaper_hierarchy * net_shaper_hierarchy_setup(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); if (hierarchy) return hierarchy; hierarchy = kmalloc(sizeof(*hierarchy), GFP_KERNEL); if (!hierarchy) return NULL; /* The flag is required for ID allocation */ xa_init_flags(&hierarchy->shapers, XA_FLAGS_ALLOC); switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: /* Pairs with READ_ONCE in net_shaper_hierarchy. */ WRITE_ONCE(binding->netdev->net_shaper_hierarchy, hierarchy); break; } return hierarchy; } /* Prepare the hierarchy container to actually insert the given shaper, doing * in advance the needed allocations. */ static int net_shaper_pre_insert(struct net_shaper_binding *binding, struct net_shaper_handle *handle, struct netlink_ext_ack *extack) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *prev, *cur; bool id_allocated = false; int ret, index; if (!hierarchy) return -ENOMEM; index = net_shaper_handle_to_index(handle); cur = xa_load(&hierarchy->shapers, index); if (cur) return 0; /* Allocated a new id, if needed. */ if (handle->scope == NET_SHAPER_SCOPE_NODE && handle->id == NET_SHAPER_ID_UNSPEC) { u32 min, max; handle->id = NET_SHAPER_ID_MASK - 1; max = net_shaper_handle_to_index(handle); handle->id = 0; min = net_shaper_handle_to_index(handle); ret = xa_alloc(&hierarchy->shapers, &index, NULL, XA_LIMIT(min, max), GFP_KERNEL); if (ret < 0) { NL_SET_ERR_MSG(extack, "Can't allocate new id for NODE shaper"); return ret; } net_shaper_index_to_handle(index, handle); id_allocated = true; } cur = kzalloc(sizeof(*cur), GFP_KERNEL); if (!cur) { ret = -ENOMEM; goto free_id; } /* Mark 'tentative' shaper inside the hierarchy container. * xa_set_mark is a no-op if the previous store fails. */ xa_lock(&hierarchy->shapers); prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); xa_unlock(&hierarchy->shapers); if (xa_err(prev)) { NL_SET_ERR_MSG(extack, "Can't insert shaper into device store"); kfree_rcu(cur, rcu); ret = xa_err(prev); goto free_id; } return 0; free_id: if (id_allocated) xa_erase(&hierarchy->shapers, index); return ret; } /* Commit the tentative insert with the actual values. * Must be called only after a successful net_shaper_pre_insert(). */ static void net_shaper_commit(struct net_shaper_binding *binding, int nr_shapers, const struct net_shaper *shapers) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur; int index; int i; xa_lock(&hierarchy->shapers); for (i = 0; i < nr_shapers; ++i) { index = net_shaper_handle_to_index(&shapers[i].handle); cur = xa_load(&hierarchy->shapers, index); if (WARN_ON_ONCE(!cur)) continue; /* Successful update: drop the tentative mark * and update the hierarchy container. */ __xa_clear_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); *cur = shapers[i]; } xa_unlock(&hierarchy->shapers); } /* Rollback all the tentative inserts from the hierarchy. */ static void net_shaper_rollback(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur; unsigned long index; if (!hierarchy) return; xa_lock(&hierarchy->shapers); xa_for_each_marked(&hierarchy->shapers, index, cur, NET_SHAPER_NOT_VALID) { __xa_erase(&hierarchy->shapers, index); kfree(cur); } xa_unlock(&hierarchy->shapers); } static int net_shaper_parse_handle(const struct nlattr *attr, const struct genl_info *info, struct net_shaper_handle *handle) { struct nlattr *tb[NET_SHAPER_A_HANDLE_MAX + 1]; struct nlattr *id_attr; u32 id = 0; int ret; ret = nla_parse_nested(tb, NET_SHAPER_A_HANDLE_MAX, attr, net_shaper_handle_nl_policy, info->extack); if (ret < 0) return ret; if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NET_SHAPER_A_HANDLE_SCOPE)) return -EINVAL; handle->scope = nla_get_u32(tb[NET_SHAPER_A_HANDLE_SCOPE]); /* The default id for NODE scope shapers is an invalid one * to help the 'group' operation discriminate between new * NODE shaper creation (ID_UNSPEC) and reuse of existing * shaper (any other value). */ id_attr = tb[NET_SHAPER_A_HANDLE_ID]; if (id_attr) id = nla_get_u32(id_attr); else if (handle->scope == NET_SHAPER_SCOPE_NODE) id = NET_SHAPER_ID_UNSPEC; handle->id = id; return 0; } static int net_shaper_validate_caps(struct net_shaper_binding *binding, struct nlattr **tb, const struct genl_info *info, struct net_shaper *shaper) { const struct net_shaper_ops *ops = net_shaper_ops(binding); struct nlattr *bad = NULL; unsigned long caps = 0; ops->capabilities(binding, shaper->handle.scope, &caps); if (tb[NET_SHAPER_A_PRIORITY] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_PRIORITY))) bad = tb[NET_SHAPER_A_PRIORITY]; if (tb[NET_SHAPER_A_WEIGHT] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_WEIGHT))) bad = tb[NET_SHAPER_A_WEIGHT]; if (tb[NET_SHAPER_A_BW_MIN] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MIN))) bad = tb[NET_SHAPER_A_BW_MIN]; if (tb[NET_SHAPER_A_BW_MAX] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MAX))) bad = tb[NET_SHAPER_A_BW_MAX]; if (tb[NET_SHAPER_A_BURST] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BURST))) bad = tb[NET_SHAPER_A_BURST]; if (!caps) bad = tb[NET_SHAPER_A_HANDLE]; if (bad) { NL_SET_BAD_ATTR(info->extack, bad); return -EOPNOTSUPP; } if (shaper->handle.scope == NET_SHAPER_SCOPE_QUEUE && binding->type == NET_SHAPER_BINDING_TYPE_NETDEV && shaper->handle.id >= binding->netdev->real_num_tx_queues) { NL_SET_ERR_MSG_FMT(info->extack, "Not existing queue id %d max %d", shaper->handle.id, binding->netdev->real_num_tx_queues); return -ENOENT; } /* The metric is really used only if there is *any* rate-related * setting, either in current attributes set or in pre-existing * values. */ if (shaper->burst || shaper->bw_min || shaper->bw_max) { u32 metric_cap = NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS + shaper->metric; /* The metric test can fail even when the user did not * specify the METRIC attribute. Pointing to rate related * attribute will be confusing, as the attribute itself * could be indeed supported, with a different metric. * Be more specific. */ if (!(caps & BIT(metric_cap))) { NL_SET_ERR_MSG_FMT(info->extack, "Bad metric %d", shaper->metric); return -EOPNOTSUPP; } } return 0; } static int net_shaper_parse_info(struct net_shaper_binding *binding, struct nlattr **tb, const struct genl_info *info, struct net_shaper *shaper, bool *exists) { struct net_shaper *old; int ret; /* The shaper handle is the only mandatory attribute. */ if (NL_REQ_ATTR_CHECK(info->extack, NULL, tb, NET_SHAPER_A_HANDLE)) return -EINVAL; ret = net_shaper_parse_handle(tb[NET_SHAPER_A_HANDLE], info, &shaper->handle); if (ret) return ret; if (shaper->handle.scope == NET_SHAPER_SCOPE_UNSPEC) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); return -EINVAL; } /* Fetch existing hierarchy, if any, so that user provide info will * incrementally update the existing shaper configuration. */ old = net_shaper_lookup(binding, &shaper->handle); if (old) *shaper = *old; *exists = !!old; if (tb[NET_SHAPER_A_METRIC]) shaper->metric = nla_get_u32(tb[NET_SHAPER_A_METRIC]); if (tb[NET_SHAPER_A_BW_MIN]) shaper->bw_min = nla_get_uint(tb[NET_SHAPER_A_BW_MIN]); if (tb[NET_SHAPER_A_BW_MAX]) shaper->bw_max = nla_get_uint(tb[NET_SHAPER_A_BW_MAX]); if (tb[NET_SHAPER_A_BURST]) shaper->burst = nla_get_uint(tb[NET_SHAPER_A_BURST]); if (tb[NET_SHAPER_A_PRIORITY]) shaper->priority = nla_get_u32(tb[NET_SHAPER_A_PRIORITY]); if (tb[NET_SHAPER_A_WEIGHT]) shaper->weight = nla_get_u32(tb[NET_SHAPER_A_WEIGHT]); ret = net_shaper_validate_caps(binding, tb, info, shaper); if (ret < 0) return ret; return 0; } static int net_shaper_validate_nesting(struct net_shaper_binding *binding, const struct net_shaper *shaper, struct netlink_ext_ack *extack) { const struct net_shaper_ops *ops = net_shaper_ops(binding); unsigned long caps = 0; ops->capabilities(binding, shaper->handle.scope, &caps); if (!(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_NESTING))) { NL_SET_ERR_MSG_FMT(extack, "Nesting not supported for scope %d", shaper->handle.scope); return -EOPNOTSUPP; } return 0; } /* Fetch the existing leaf and update it with the user-provided * attributes. */ static int net_shaper_parse_leaf(struct net_shaper_binding *binding, const struct nlattr *attr, const struct genl_info *info, const struct net_shaper *node, struct net_shaper *shaper) { struct nlattr *tb[NET_SHAPER_A_WEIGHT + 1]; bool exists; int ret; ret = nla_parse_nested(tb, NET_SHAPER_A_WEIGHT, attr, net_shaper_leaf_info_nl_policy, info->extack); if (ret < 0) return ret; ret = net_shaper_parse_info(binding, tb, info, shaper, &exists); if (ret < 0) return ret; if (shaper->handle.scope != NET_SHAPER_SCOPE_QUEUE) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); return -EINVAL; } if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { ret = net_shaper_validate_nesting(binding, shaper, info->extack); if (ret < 0) return ret; } if (!exists) net_shaper_default_parent(&shaper->handle, &shaper->parent); return 0; } /* Alike net_parse_shaper_info(), but additionally allow the user specifying * the shaper's parent handle. */ static int net_shaper_parse_node(struct net_shaper_binding *binding, struct nlattr **tb, const struct genl_info *info, struct net_shaper *shaper) { bool exists; int ret; ret = net_shaper_parse_info(binding, tb, info, shaper, &exists); if (ret) return ret; if (shaper->handle.scope != NET_SHAPER_SCOPE_NODE && shaper->handle.scope != NET_SHAPER_SCOPE_NETDEV) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); return -EINVAL; } if (tb[NET_SHAPER_A_PARENT]) { ret = net_shaper_parse_handle(tb[NET_SHAPER_A_PARENT], info, &shaper->parent); if (ret) return ret; if (shaper->parent.scope != NET_SHAPER_SCOPE_NODE && shaper->parent.scope != NET_SHAPER_SCOPE_NETDEV) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_PARENT]); return -EINVAL; } } return 0; } static int net_shaper_generic_pre(struct genl_info *info, int type) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(info->ctx)); return net_shaper_ctx_setup(info, type, ctx); } int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { return net_shaper_generic_pre(info, NET_SHAPER_A_IFINDEX); } static void net_shaper_generic_post(struct genl_info *info) { net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)info->ctx); } void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { net_shaper_generic_post(info); } int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; const struct genl_info *info = genl_info_dump(cb); return net_shaper_ctx_setup(info, NET_SHAPER_A_IFINDEX, ctx); } int net_shaper_nl_post_dumpit(struct netlink_callback *cb) { net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)cb->ctx); return 0; } int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { return net_shaper_generic_pre(info, NET_SHAPER_A_CAPS_IFINDEX); } void net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { net_shaper_generic_post(info); } int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; return net_shaper_ctx_setup(genl_info_dump(cb), NET_SHAPER_A_CAPS_IFINDEX, ctx); } int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; net_shaper_ctx_cleanup(ctx); return 0; } int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_binding *binding; struct net_shaper_handle handle; struct net_shaper *shaper; struct sk_buff *msg; int ret; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, &handle); if (ret < 0) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); shaper = net_shaper_lookup(binding, &handle); if (!shaper) { NL_SET_BAD_ATTR(info->extack, info->attrs[NET_SHAPER_A_HANDLE]); rcu_read_unlock(); ret = -ENOENT; goto free_msg; } ret = net_shaper_fill_one(msg, binding, shaper, info); rcu_read_unlock(); if (ret) goto free_msg; ret = genlmsg_reply(msg, info); if (ret) goto free_msg; return 0; free_msg: nlmsg_free(msg); return ret; } int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; const struct genl_info *info = genl_info_dump(cb); struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; struct net_shaper *shaper; int ret = 0; /* Don't error out dumps performed before any set operation. */ binding = net_shaper_binding_from_ctx(ctx); hierarchy = net_shaper_hierarchy(binding); if (!hierarchy) return 0; rcu_read_lock(); for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, U32_MAX, XA_PRESENT)); ctx->start_index++) { ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; } rcu_read_unlock(); return ret; } int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; const struct net_shaper_ops *ops; struct net_shaper_handle handle; struct net_shaper shaper = {}; bool exists; int ret; binding = net_shaper_binding_from_ctx(info->ctx); net_shaper_lock(binding); ret = net_shaper_parse_info(binding, info->attrs, info, &shaper, &exists); if (ret) goto unlock; if (!exists) net_shaper_default_parent(&shaper.handle, &shaper.parent); hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { ret = -ENOMEM; goto unlock; } /* The 'set' operation can't create node-scope shapers. */ handle = shaper.handle; if (handle.scope == NET_SHAPER_SCOPE_NODE && !net_shaper_lookup(binding, &handle)) { ret = -ENOENT; goto unlock; } ret = net_shaper_pre_insert(binding, &handle, info->extack); if (ret) goto unlock; ops = net_shaper_ops(binding); ret = ops->set(binding, &shaper, info->extack); if (ret) { net_shaper_rollback(binding); goto unlock; } net_shaper_commit(binding, 1, &shaper); unlock: net_shaper_unlock(binding); return ret; } static int __net_shaper_delete(struct net_shaper_binding *binding, struct net_shaper *shaper, struct netlink_ext_ack *extack) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper_handle parent_handle, handle = shaper->handle; const struct net_shaper_ops *ops = net_shaper_ops(binding); int ret; again: parent_handle = shaper->parent; ret = ops->delete(binding, &handle, extack); if (ret < 0) return ret; xa_erase(&hierarchy->shapers, net_shaper_handle_to_index(&handle)); kfree_rcu(shaper, rcu); /* Eventually delete the parent, if it is left over with no leaves. */ if (parent_handle.scope == NET_SHAPER_SCOPE_NODE) { shaper = net_shaper_lookup(binding, &parent_handle); if (shaper && !--shaper->leaves) { handle = parent_handle; goto again; } } return 0; } static int net_shaper_handle_cmp(const struct net_shaper_handle *a, const struct net_shaper_handle *b) { /* Must avoid holes in struct net_shaper_handle. */ BUILD_BUG_ON(sizeof(*a) != 8); return memcmp(a, b, sizeof(*a)); } static int net_shaper_parent_from_leaves(int leaves_count, const struct net_shaper *leaves, struct net_shaper *node, struct netlink_ext_ack *extack) { struct net_shaper_handle parent = leaves[0].parent; int i; for (i = 1; i < leaves_count; ++i) { if (net_shaper_handle_cmp(&leaves[i].parent, &parent)) { NL_SET_ERR_MSG_FMT(extack, "All the leaves shapers must have the same old parent"); return -EINVAL; } } node->parent = parent; return 0; } static int __net_shaper_group(struct net_shaper_binding *binding, bool update_node, int leaves_count, struct net_shaper *leaves, struct net_shaper *node, struct netlink_ext_ack *extack) { const struct net_shaper_ops *ops = net_shaper_ops(binding); struct net_shaper_handle leaf_handle; struct net_shaper *parent = NULL; bool new_node = false; int i, ret; if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { new_node = node->handle.id == NET_SHAPER_ID_UNSPEC; if (!new_node && !net_shaper_lookup(binding, &node->handle)) { /* The related attribute is not available when * reaching here from the delete() op. */ NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists", node->handle.scope, node->handle.id); return -ENOENT; } /* When unspecified, the node parent scope is inherited from * the leaves. */ if (node->parent.scope == NET_SHAPER_SCOPE_UNSPEC) { ret = net_shaper_parent_from_leaves(leaves_count, leaves, node, extack); if (ret) return ret; } } else { net_shaper_default_parent(&node->handle, &node->parent); } if (node->parent.scope == NET_SHAPER_SCOPE_NODE) { parent = net_shaper_lookup(binding, &node->parent); if (!parent) { NL_SET_ERR_MSG_FMT(extack, "Node parent shaper %d:%d does not exists", node->parent.scope, node->parent.id); return -ENOENT; } ret = net_shaper_validate_nesting(binding, node, extack); if (ret < 0) return ret; } if (update_node) { /* For newly created node scope shaper, the following will * update the handle, due to id allocation. */ ret = net_shaper_pre_insert(binding, &node->handle, extack); if (ret) return ret; } for (i = 0; i < leaves_count; ++i) { leaf_handle = leaves[i].handle; ret = net_shaper_pre_insert(binding, &leaf_handle, extack); if (ret) goto rollback; if (!net_shaper_handle_cmp(&leaves[i].parent, &node->handle)) continue; /* The leaves shapers will be nested to the node, update the * linking accordingly. */ leaves[i].parent = node->handle; node->leaves++; } ret = ops->group(binding, leaves_count, leaves, node, extack); if (ret < 0) goto rollback; /* The node's parent gains a new leaf only when the node itself * is created by this group operation */ if (new_node && parent) parent->leaves++; if (update_node) net_shaper_commit(binding, 1, node); net_shaper_commit(binding, leaves_count, leaves); return 0; rollback: net_shaper_rollback(binding); return ret; } static int net_shaper_pre_del_node(struct net_shaper_binding *binding, const struct net_shaper *shaper, struct netlink_ext_ack *extack) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur, *leaves, node = {}; int ret, leaves_count = 0; unsigned long index; bool update_node; if (!shaper->leaves) return 0; /* Fetch the new node information. */ node.handle = shaper->parent; cur = net_shaper_lookup(binding, &node.handle); if (cur) { node = *cur; } else { /* A scope NODE shaper can be nested only to the NETDEV scope * shaper without creating the latter, this check may fail only * if the data is in inconsistent status. */ if (WARN_ON_ONCE(node.handle.scope != NET_SHAPER_SCOPE_NETDEV)) return -EINVAL; } leaves = kcalloc(shaper->leaves, sizeof(struct net_shaper), GFP_KERNEL); if (!leaves) return -ENOMEM; /* Build the leaves arrays. */ xa_for_each(&hierarchy->shapers, index, cur) { if (net_shaper_handle_cmp(&cur->parent, &shaper->handle)) continue; if (WARN_ON_ONCE(leaves_count == shaper->leaves)) { ret = -EINVAL; goto free; } leaves[leaves_count++] = *cur; } /* When re-linking to the netdev shaper, avoid the eventual, implicit, * creation of the new node, would be surprising since the user is * doing a delete operation. */ update_node = node.handle.scope != NET_SHAPER_SCOPE_NETDEV; ret = __net_shaper_group(binding, update_node, leaves_count, leaves, &node, extack); free: kfree(leaves); return ret; } int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; struct net_shaper_handle handle; struct net_shaper *shaper; int ret; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); net_shaper_lock(binding); ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, &handle); if (ret) goto unlock; hierarchy = net_shaper_hierarchy(binding); if (!hierarchy) { ret = -ENOENT; goto unlock; } shaper = net_shaper_lookup(binding, &handle); if (!shaper) { ret = -ENOENT; goto unlock; } if (handle.scope == NET_SHAPER_SCOPE_NODE) { ret = net_shaper_pre_del_node(binding, shaper, info->extack); if (ret) goto unlock; } ret = __net_shaper_delete(binding, shaper, info->extack); unlock: net_shaper_unlock(binding); return ret; } static int net_shaper_group_send_reply(struct net_shaper_binding *binding, const struct net_shaper_handle *handle, struct genl_info *info, struct sk_buff *msg) { void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) goto free_msg; if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || net_shaper_fill_handle(msg, handle, NET_SHAPER_A_HANDLE)) goto free_msg; genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); free_msg: /* Should never happen as msg is pre-allocated with enough space. */ WARN_ONCE(true, "calculated message payload length (%d)", net_shaper_handle_size()); nlmsg_free(msg); return -EMSGSIZE; } int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper **old_nodes, *leaves, node = {}; struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; int i, ret, rem, leaves_count; int old_nodes_count = 0; struct sk_buff *msg; struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); /* The group operation is optional. */ if (!net_shaper_ops(binding)->group) return -EOPNOTSUPP; net_shaper_lock(binding); leaves_count = net_shaper_list_len(info, NET_SHAPER_A_LEAVES); if (!leaves_count) { NL_SET_BAD_ATTR(info->extack, info->attrs[NET_SHAPER_A_LEAVES]); ret = -EINVAL; goto unlock; } leaves = kcalloc(leaves_count, sizeof(struct net_shaper) + sizeof(struct net_shaper *), GFP_KERNEL); if (!leaves) { ret = -ENOMEM; goto unlock; } old_nodes = (void *)&leaves[leaves_count]; ret = net_shaper_parse_node(binding, info->attrs, info, &node); if (ret) goto free_leaves; i = 0; nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) { if (WARN_ON_ONCE(i >= leaves_count)) goto free_leaves; ret = net_shaper_parse_leaf(binding, attr, info, &node, &leaves[i]); if (ret) goto free_leaves; i++; } /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. */ msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); if (!msg) goto free_leaves; hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { ret = -ENOMEM; goto free_msg; } /* Record the node shapers that this group() operation can make * childless for later cleanup. */ for (i = 0; i < leaves_count; i++) { if (leaves[i].parent.scope == NET_SHAPER_SCOPE_NODE && net_shaper_handle_cmp(&leaves[i].parent, &node.handle)) { struct net_shaper *tmp; tmp = net_shaper_lookup(binding, &leaves[i].parent); if (!tmp) continue; old_nodes[old_nodes_count++] = tmp; } } ret = __net_shaper_group(binding, true, leaves_count, leaves, &node, info->extack); if (ret) goto free_msg; /* Check if we need to delete any node left alone by the new leaves * linkage. */ for (i = 0; i < old_nodes_count; ++i) { struct net_shaper *tmp = old_nodes[i]; if (--tmp->leaves > 0) continue; /* Errors here are not fatal: the grouping operation is * completed, and user-space can still explicitly clean-up * left-over nodes. */ __net_shaper_delete(binding, tmp, info->extack); } ret = net_shaper_group_send_reply(binding, &node.handle, info, msg); if (ret) GENL_SET_ERR_MSG_FMT(info, "Can't send reply"); free_leaves: kfree(leaves); unlock: net_shaper_unlock(binding); return ret; free_msg: kfree_skb(msg); goto free_leaves; } static int net_shaper_cap_fill_one(struct sk_buff *msg, struct net_shaper_binding *binding, enum net_shaper_scope scope, unsigned long flags, const struct genl_info *info) { unsigned long cur; void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) return -EMSGSIZE; if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_CAPS_IFINDEX) || nla_put_u32(msg, NET_SHAPER_A_CAPS_SCOPE, scope)) goto nla_put_failure; for (cur = NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS; cur <= NET_SHAPER_A_CAPS_MAX; ++cur) { if (flags & BIT(cur) && nla_put_flag(msg, cur)) goto nla_put_failure; } genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_binding *binding; const struct net_shaper_ops *ops; enum net_shaper_scope scope; unsigned long flags = 0; struct sk_buff *msg; int ret; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_CAPS_SCOPE)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); scope = nla_get_u32(info->attrs[NET_SHAPER_A_CAPS_SCOPE]); ops = net_shaper_ops(binding); ops->capabilities(binding, scope, &flags); if (!flags) return -EOPNOTSUPP; msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = net_shaper_cap_fill_one(msg, binding, scope, flags, info); if (ret) goto free_msg; ret = genlmsg_reply(msg, info); if (ret) goto free_msg; return 0; free_msg: nlmsg_free(msg); return ret; } int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct net_shaper_binding *binding; const struct net_shaper_ops *ops; enum net_shaper_scope scope; int ret; binding = net_shaper_binding_from_ctx(cb->ctx); ops = net_shaper_ops(binding); for (scope = 0; scope <= NET_SHAPER_SCOPE_MAX; ++scope) { unsigned long flags = 0; ops->capabilities(binding, scope, &flags); if (!flags) continue; ret = net_shaper_cap_fill_one(skb, binding, scope, flags, info); if (ret) return ret; } return 0; } static void net_shaper_flush(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur; unsigned long index; if (!hierarchy) return; net_shaper_lock(binding); xa_lock(&hierarchy->shapers); xa_for_each(&hierarchy->shapers, index, cur) { __xa_erase(&hierarchy->shapers, index); kfree(cur); } xa_unlock(&hierarchy->shapers); net_shaper_unlock(binding); kfree(hierarchy); } void net_shaper_flush_netdev(struct net_device *dev) { struct net_shaper_binding binding = { .type = NET_SHAPER_BINDING_TYPE_NETDEV, .netdev = dev, }; net_shaper_flush(&binding); } void net_shaper_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding binding; int i; binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; binding.netdev = dev; hierarchy = net_shaper_hierarchy(&binding); if (!hierarchy) return; /* Only drivers implementing shapers support ensure * the lock is acquired in advance. */ netdev_assert_locked(dev); /* Take action only when decreasing the tx queue number. */ for (i = txq; i < dev->real_num_tx_queues; ++i) { struct net_shaper_handle handle, parent_handle; struct net_shaper *shaper; u32 index; handle.scope = NET_SHAPER_SCOPE_QUEUE; handle.id = i; shaper = net_shaper_lookup(&binding, &handle); if (!shaper) continue; /* Don't touch the H/W for the queue shaper, the drivers already * deleted the queue and related resources. */ parent_handle = shaper->parent; index = net_shaper_handle_to_index(&handle); xa_erase(&hierarchy->shapers, index); kfree_rcu(shaper, rcu); /* The recursion on parent does the full job. */ if (parent_handle.scope != NET_SHAPER_SCOPE_NODE) continue; shaper = net_shaper_lookup(&binding, &parent_handle); if (shaper && !--shaper->leaves) __net_shaper_delete(&binding, shaper, NULL); } } static int __init shaper_init(void) { return genl_register_family(&net_shaper_nl_family); } subsys_initcall(shaper_init); |
| 12 13 12 12 12 12 12 12 12 12 12 12 3 2 1 3 12 6 12 101 101 101 101 8 7 7 6 8 14 14 13 13 12 7 7 13 13 13 2 13 1 12 11 2 9 11 8 4 10 10 101 101 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | // SPDX-License-Identifier: GPL-2.0-only /* * vxcan.c - Virtual CAN Tunnel for cross namespace communication * * This code is derived from drivers/net/can/vcan.c for the virtual CAN * specific parts and from drivers/net/veth.c to implement the netlink API * for network interface pairs in a common and established way. * * Copyright (c) 2017 Oliver Hartkopp <socketcan@hartkopp.net> */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/can/vxcan.h> #include <linux/can/can-ml.h> #include <linux/slab.h> #include <net/rtnetlink.h> #define DRV_NAME "vxcan" MODULE_DESCRIPTION("Virtual CAN Tunnel"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); struct vxcan_priv { struct net_device __rcu *peer; }; static netdev_tx_t vxcan_xmit(struct sk_buff *oskb, struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer; struct net_device_stats *peerstats, *srcstats = &dev->stats; struct sk_buff *skb; unsigned int len; if (can_dropped_invalid_skb(dev, oskb)) return NETDEV_TX_OK; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (unlikely(!peer)) { kfree_skb(oskb); dev->stats.tx_dropped++; goto out_unlock; } skb_tx_timestamp(oskb); skb = skb_clone(oskb, GFP_ATOMIC); if (skb) { consume_skb(oskb); } else { kfree_skb(oskb); goto out_unlock; } /* reset CAN GW hop counter */ skb->csum_start = 0; skb->pkt_type = PACKET_BROADCAST; skb->dev = peer; skb->ip_summed = CHECKSUM_UNNECESSARY; len = can_skb_get_data_len(skb); if (netif_rx(skb) == NET_RX_SUCCESS) { srcstats->tx_packets++; srcstats->tx_bytes += len; peerstats = &peer->stats; peerstats->rx_packets++; peerstats->rx_bytes += len; } out_unlock: rcu_read_unlock(); return NETDEV_TX_OK; } static int vxcan_open(struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } return 0; } static int vxcan_close(struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); netif_carrier_off(dev); if (peer) netif_carrier_off(peer); return 0; } static int vxcan_get_iflink(const struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer; int iflink; rcu_read_lock(); peer = rcu_dereference(priv->peer); iflink = peer ? READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; } static int vxcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU && !can_is_canxl_dev_mtu(new_mtu)) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static const struct net_device_ops vxcan_netdev_ops = { .ndo_open = vxcan_open, .ndo_stop = vxcan_close, .ndo_start_xmit = vxcan_xmit, .ndo_get_iflink = vxcan_get_iflink, .ndo_change_mtu = vxcan_change_mtu, }; static const struct ethtool_ops vxcan_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void vxcan_setup(struct net_device *dev) { struct can_ml_priv *can_ml; dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; dev->netdev_ops = &vxcan_netdev_ops; dev->ethtool_ops = &vxcan_ethtool_ops; dev->needs_free_netdev = true; can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); can_set_ml_priv(dev, can_ml); } /* forward declaration for rtnl_create_link() */ static struct rtnl_link_ops vxcan_link_ops; static int vxcan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *peer_net = rtnl_newlink_peer_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct vxcan_priv *priv; struct net_device *peer; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb; char ifname[IFNAMSIZ]; unsigned char name_assign_type; struct ifinfomsg *ifmp = NULL; int err; /* register peer device */ if (data && data[VXCAN_INFO_PEER]) { struct nlattr *nla_peer = data[VXCAN_INFO_PEER]; ifmp = nla_data(nla_peer); rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } if (ifmp && tbp[IFLA_IFNAME]) { nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); name_assign_type = NET_NAME_ENUM; } peer = rtnl_create_link(peer_net, ifname, name_assign_type, &vxcan_link_ops, tbp, extack); if (IS_ERR(peer)) return PTR_ERR(peer); if (ifmp && dev->ifindex) peer->ifindex = ifmp->ifi_index; err = register_netdevice(peer); if (err < 0) { free_netdev(peer); return err; } netif_carrier_off(peer); err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto unregister_network_device; /* register first device */ if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); err = register_netdevice(dev); if (err < 0) goto unregister_network_device; netif_carrier_off(dev); /* cross link the device pair */ priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); return 0; unregister_network_device: unregister_netdevice(peer); return err; } static void vxcan_dellink(struct net_device *dev, struct list_head *head) { struct vxcan_priv *priv; struct net_device *peer; priv = netdev_priv(dev); peer = rtnl_dereference(priv->peer); /* Note : dellink() is called from default_device_exit_batch(), * before a rcu_synchronize() point. The devices are guaranteed * not being freed before one RCU grace period. */ RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(dev, head); if (peer) { priv = netdev_priv(peer); RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(peer, head); } } static const struct nla_policy vxcan_policy[VXCAN_INFO_MAX + 1] = { [VXCAN_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, }; static struct net *vxcan_get_link_net(const struct net_device *dev) { struct vxcan_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); return peer ? dev_net(peer) : dev_net(dev); } static struct rtnl_link_ops vxcan_link_ops = { .kind = DRV_NAME, .priv_size = ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN) + sizeof(struct can_ml_priv), .setup = vxcan_setup, .newlink = vxcan_newlink, .dellink = vxcan_dellink, .policy = vxcan_policy, .peer_type = VXCAN_INFO_PEER, .maxtype = VXCAN_INFO_MAX, .get_link_net = vxcan_get_link_net, }; static __init int vxcan_init(void) { pr_info("vxcan: Virtual CAN Tunnel driver\n"); return rtnl_link_register(&vxcan_link_ops); } static __exit void vxcan_exit(void) { rtnl_link_unregister(&vxcan_link_ops); } module_init(vxcan_init); module_exit(vxcan_exit); |
| 453 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/dir.c * * Copyright (C) 1992 Rick Sladkey * * nfs directory handling functions * * 10 Apr 1996 Added silly rename for unlink --okir * 28 Sep 1996 Improved directory cache --okir * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de * Re-implemented silly rename for unlink, newly implemented * silly rename for nfs_rename() following the suggestions * of Olaf Kirch (okir) found in this file. * Following Linus comments on my original hack, this version * depends only on the dcache stuff and doesn't touch the inode * layer (iput() and friends). * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM */ #include <linux/compat.h> #include <linux/module.h> #include <linux/time.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/swap.h> #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> #include <linux/hash.h> #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "nfstrace.h" /* #define NFS_DEBUG_VERBOSE 1 */ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct folio *); static int nfs_do_create(struct inode *dir, struct dentry *dentry, umode_t mode, int open_flags); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, .iterate_shared = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, }; const struct address_space_operations nfs_dir_aops = { .free_folio = nfs_readdir_clear_array, }; #define NFS_INIT_DTSIZE PAGE_SIZE static struct nfs_open_dir_context * alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (ctx != NULL) { ctx->attr_gencount = nfsi->attr_gencount; ctx->dtsize = NFS_INIT_DTSIZE; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) nfs_set_cache_invalid(dir, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf)); spin_unlock(&dir->i_lock); return ctx; } return ERR_PTR(-ENOMEM); } static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { spin_lock(&dir->i_lock); list_del_rcu(&ctx->list); spin_unlock(&dir->i_lock); kfree_rcu(ctx, rcu_head); } /* * Open file */ static int nfs_opendir(struct inode *inode, struct file *filp) { int res = 0; struct nfs_open_dir_context *ctx; dfprintk(FILE, "NFS: open dir(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSOPEN); ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; } filp->private_data = ctx; out: return res; } static int nfs_closedir(struct inode *inode, struct file *filp) { put_nfs_open_dir_context(file_inode(filp), filp->private_data); return 0; } struct nfs_cache_array_entry { u64 cookie; u64 ino; const char *name; unsigned int name_len; unsigned char d_type; }; struct nfs_cache_array { u64 change_attr; u64 last_cookie; unsigned int size; unsigned char folio_full : 1, folio_is_eof : 1, cookies_are_ordered : 1; struct nfs_cache_array_entry array[] __counted_by(size); }; struct nfs_readdir_descriptor { struct file *file; struct folio *folio; struct dir_context *ctx; pgoff_t folio_index; pgoff_t folio_index_max; u64 dir_cookie; u64 last_cookie; loff_t current_index; __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; unsigned long attr_gencount; unsigned int cache_entry_index; unsigned int buffer_fills; unsigned int dtsize; bool clear_cache; bool plus; bool eob; bool eof; }; static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz) { struct nfs_server *server = NFS_SERVER(file_inode(desc->file)); unsigned int maxsize = server->dtsize; if (sz > maxsize) sz = maxsize; if (sz < NFS_MIN_FILE_IO_SIZE) sz = NFS_MIN_FILE_IO_SIZE; desc->dtsize = sz; } static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc) { nfs_set_dtsize(desc, desc->dtsize >> 1); } static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) { nfs_set_dtsize(desc, desc->dtsize << 1); } static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie, u64 change_attr) { struct nfs_cache_array *array; array = kmap_local_folio(folio, 0); array->change_attr = change_attr; array->last_cookie = last_cookie; array->size = 0; array->folio_full = 0; array->folio_is_eof = 0; array->cookies_are_ordered = 1; kunmap_local(array); } /* * we are freeing strings created by nfs_add_to_readdir_array() */ static void nfs_readdir_clear_array(struct folio *folio) { struct nfs_cache_array *array; unsigned int i; array = kmap_local_folio(folio, 0); for (i = 0; i < array->size; i++) kfree(array->array[i].name); array->size = 0; kunmap_local(array); } static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie, u64 change_attr) { nfs_readdir_clear_array(folio); nfs_readdir_folio_init_array(folio, last_cookie, change_attr); } static struct folio * nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags) { struct folio *folio = folio_alloc(gfp_flags, 0); if (folio) nfs_readdir_folio_init_array(folio, last_cookie, 0); return folio; } static void nfs_readdir_folio_array_free(struct folio *folio) { if (folio) { nfs_readdir_clear_array(folio); folio_put(folio); } } static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) { return array->size == 0 ? array->last_cookie : array->array[0].cookie; } static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { array->folio_is_eof = 1; array->folio_full = 1; } static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) { return array->folio_full; } /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { const char *ret = kmemdup_nul(name, len, GFP_KERNEL); /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. */ if (ret != NULL) kmemleak_not_leak(ret); return ret; } static size_t nfs_readdir_array_maxentries(void) { return (PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry); } /* * Check that the next array entry lies entirely within the page bounds */ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) { if (array->folio_full) return -ENOSPC; if (array->size == nfs_readdir_array_maxentries()) { array->folio_full = 1; return -ENOSPC; } return 0; } static int nfs_readdir_folio_array_append(struct folio *folio, const struct nfs_entry *entry, u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; const char *name; int ret = -ENOMEM; name = nfs_readdir_copy_name(entry->name, entry->len); array = kmap_local_folio(folio, 0); if (!name) goto out; ret = nfs_readdir_array_can_expand(array); if (ret) { kfree(name); goto out; } array->size++; cache_entry = &array->array[array->size - 1]; cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; cache_entry->name_len = entry->len; cache_entry->name = name; array->last_cookie = entry->cookie; if (array->last_cookie <= cache_entry->cookie) array->cookies_are_ordered = 0; if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: *cookie = array->last_cookie; kunmap_local(array); return ret; } #define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14) /* * Hash algorithm allowing content addressible access to sequences * of directory cookies. Content is addressed by the value of the * cookie index of the first readdir entry in a page. * * We select only the first 18 bits to avoid issues with excessive * memory use for the page cache XArray. 18 bits should allow the caching * of 262144 pages of sequences of readdir entries. Since each page holds * 127 readdir entries for a typical 64-bit system, that works out to a * cache of ~ 33 million entries per directory. */ static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie) { if (cookie == 0) return 0; return hash_64(cookie, 18); } static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie, u64 change_attr) { struct nfs_cache_array *array = kmap_local_folio(folio, 0); int ret = true; if (array->change_attr != change_attr) ret = false; if (nfs_readdir_array_index_cookie(array) != last_cookie) ret = false; kunmap_local(array); return ret; } static void nfs_readdir_folio_unlock_and_put(struct folio *folio) { folio_unlock(folio); folio_put(folio); } static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie, u64 change_attr) { if (folio_test_uptodate(folio)) { if (nfs_readdir_folio_validate(folio, cookie, change_attr)) return; nfs_readdir_clear_array(folio); } nfs_readdir_folio_init_array(folio, cookie, change_attr); folio_mark_uptodate(folio); } static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping, u64 cookie, u64 change_attr) { pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); struct folio *folio; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return NULL; nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); return folio; } static u64 nfs_readdir_folio_last_cookie(struct folio *folio) { struct nfs_cache_array *array; u64 ret; array = kmap_local_folio(folio, 0); ret = array->last_cookie; kunmap_local(array); return ret; } static bool nfs_readdir_folio_needs_filling(struct folio *folio) { struct nfs_cache_array *array; bool ret; array = kmap_local_folio(folio, 0); ret = !nfs_readdir_array_is_full(array); kunmap_local(array); return ret; } static void nfs_readdir_folio_set_eof(struct folio *folio) { struct nfs_cache_array *array; array = kmap_local_folio(folio, 0); nfs_readdir_array_set_eof(array); kunmap_local(array); } static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping, u64 cookie, u64 change_attr) { pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); struct folio *folio; folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return NULL; nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); if (nfs_readdir_folio_last_cookie(folio) != cookie) nfs_readdir_folio_reinit_array(folio, cookie, change_attr); return folio; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } static bool nfs_readdir_use_cookie(const struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return false; return true; } static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { if (array->folio_full) { desc->last_cookie = array->last_cookie; desc->current_index += array->size; desc->cache_entry_index = 0; desc->folio_index++; } else desc->last_cookie = nfs_readdir_array_index_cookie(array); } static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) { desc->current_index = 0; desc->last_cookie = 0; desc->folio_index = 0; } static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; if (diff < 0) goto out_eof; if (diff >= array->size) { if (array->folio_is_eof) goto out_eof; nfs_readdir_seek_next_array(array, desc); return -EAGAIN; } index = (unsigned int)diff; desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: desc->eof = true; return -EBADCOOKIE; } static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, u64 cookie) { if (!array->cookies_are_ordered) return true; /* Optimisation for monotonically increasing cookies */ if (cookie >= array->last_cookie) return false; if (array->size && cookie < array->array[0].cookie) return false; return true; } static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { unsigned int i; int status = -EAGAIN; if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) goto check_eof; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { if (nfs_readdir_use_cookie(desc->file)) desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos = desc->current_index + i; desc->cache_entry_index = i; return 0; } } check_eof: if (array->folio_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; } else nfs_readdir_seek_next_array(array, desc); return status; } static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; array = kmap_local_folio(desc->folio, 0); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); kunmap_local(array); return status; } /* Fill a page with xdr information before transferring to the cache page */ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, __be32 *verf, u64 cookie, struct page **pages, size_t bufsize, __be32 *verf_res) { struct inode *inode = file_inode(desc->file); struct nfs_readdir_arg arg = { .dentry = file_dentry(desc->file), .cred = desc->file->f_cred, .verf = verf, .cookie = cookie, .pages = pages, .page_len = bufsize, .plus = desc->plus, }; struct nfs_readdir_res res = { .verf = verf_res, }; unsigned long timestamp, gencount; int error; again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; desc->plus = arg.plus = false; goto again; } goto error; } desc->timestamp = timestamp; desc->gencount = gencount; error: return error; } static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { struct inode *inode = file_inode(desc->file); int error; error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus); if (error) return error; entry->fattr->time_start = desc->timestamp; entry->fattr->gencount = desc->gencount; return 0; } /* Match file and dirent using either filehandle or fileid * Note: caller is responsible for checking the fsid */ static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { struct inode *inode; struct nfs_inode *nfsi; if (d_really_is_negative(dentry)) return 0; inode = d_inode(dentry); if (is_bad_inode(inode) || NFS_STALE(inode)) return 0; nfsi = NFS_I(inode); if (entry->fattr->fileid != nfsi->fileid) return 0; if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) return 0; return 1; } #define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL) static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, unsigned int cache_hits, unsigned int cache_misses) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS) return true; if (ctx->pos == 0 || cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; return false; } /* * This function is called by the getattr code to request the * use of readdirplus to accelerate any future lookups in the same * directory. */ void nfs_readdir_record_entry_cache_hit(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && S_ISDIR(dir->i_mode)) { rcu_read_lock(); list_for_each_entry_rcu (ctx, &nfsi->open_files, list) atomic_inc(&ctx->cache_hits); rcu_read_unlock(); } } /* * This function is mainly for use by nfs_getattr(). * * If this is an 'ls -l', we want to force use of readdirplus. */ void nfs_readdir_record_entry_cache_miss(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && S_ISDIR(dir->i_mode)) { rcu_read_lock(); list_for_each_entry_rcu (ctx, &nfsi->open_files, list) atomic_inc(&ctx->cache_misses); rcu_read_unlock(); } } static void nfs_lookup_advise_force_readdirplus(struct inode *dir, unsigned int flags) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) return; if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL)) return; nfs_readdir_record_entry_cache_miss(dir); } static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *inode; int status; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; if (filename.len == 0) return; /* Validate that the name doesn't contain any illegal '\0' */ if (strnlen(filename.name, filename.len) != filename.len) return; /* ...or '/' */ if (strnchr(filename.name, filename.len, '/')) return; if (filename.name[0] == '.') { if (filename.len == 1) return; if (filename.len == 2 && filename.name[1] == '.') return; } filename.hash = full_name_hash(parent, filename.name, filename.len); dentry = d_lookup(parent, &filename); again: if (!dentry) { dentry = d_alloc_parallel(parent, &filename, &wq); if (IS_ERR(dentry)) return; } if (!d_in_lookup(dentry)) { /* Is there a mountpoint here? If so, just exit */ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, &entry->fattr->fsid)) goto out; if (nfs_same_file(dentry, entry)) { if (!entry->fh->size) goto out; nfs_set_verifier(dentry, dir_verifier); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) nfs_setsecurity(d_inode(dentry), entry->fattr); trace_nfs_readdir_lookup_revalidate(d_inode(parent), dentry, 0, status); goto out; } else { trace_nfs_readdir_lookup_revalidate_failed( d_inode(parent), dentry, 0); d_invalidate(dentry); dput(dentry); dentry = NULL; goto again; } } if (!entry->fh->size) { d_lookup_done(dentry); goto out; } inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); alias = d_splice_alias(inode, dentry); d_lookup_done(dentry); if (alias) { if (IS_ERR(alias)) goto out; dput(dentry); dentry = alias; } nfs_set_verifier(dentry, dir_verifier); trace_nfs_readdir_lookup(d_inode(parent), dentry, 0); out: dput(dentry); } static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *stream) { int ret; if (entry->fattr->label) entry->fattr->label->len = NFS4_MAXLABELLEN; ret = xdr_decode(desc, entry, stream); if (ret || !desc->plus) return ret; nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); return 0; } /* Perform conversion from xdr to cache array */ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct page **xdr_pages, unsigned int buflen, struct folio **arrays, size_t narrays, u64 change_attr) { struct address_space *mapping = desc->file->f_mapping; struct folio *new, *folio = *arrays; struct xdr_stream stream; struct page *scratch; struct xdr_buf buf; u64 cookie; int status; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_page(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); if (status != 0) break; status = nfs_readdir_folio_array_append(folio, entry, &cookie); if (status != -ENOSPC) continue; if (folio->mapping != mapping) { if (!--narrays) break; new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; *arrays = folio = new; } else { new = nfs_readdir_folio_get_next(mapping, cookie, change_attr); if (!new) break; if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); folio = new; } desc->folio_index_max++; status = nfs_readdir_folio_array_append(folio, entry, &cookie); } while (!status && !entry->eof); switch (status) { case -EBADCOOKIE: if (!entry->eof) break; nfs_readdir_folio_set_eof(folio); fallthrough; case -EAGAIN: status = 0; break; case -ENOSPC: status = 0; if (!desc->plus) break; while (!nfs_readdir_entry_decode(desc, entry, &stream)) ; } if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); put_page(scratch); return status; } static void nfs_readdir_free_pages(struct page **pages, size_t npages) { while (npages--) put_page(pages[npages]); kfree(pages); } /* * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call * to nfs_readdir_free_pages() */ static struct page **nfs_readdir_alloc_pages(size_t npages) { struct page **pages; size_t i; pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); if (!pages) return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } return pages; out_freepages: nfs_readdir_free_pages(pages, i); return NULL; } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, struct folio **arrays, size_t narrays) { u64 change_attr; struct page **pages; struct folio *folio = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); unsigned int dtsize = desc->dtsize; unsigned int pglen; int status = -ENOMEM; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->cookie = nfs_readdir_folio_last_cookie(folio); entry->fh = nfs_alloc_fhandle(); entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); entry->server = NFS_SERVER(inode); if (entry->fh == NULL || entry->fattr == NULL) goto out; array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; pages = nfs_readdir_alloc_pages(array_size); if (!pages) goto out; change_attr = inode_peek_iversion_raw(inode); status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages, dtsize, verf_res); if (status < 0) goto free_pages; pglen = status; if (pglen != 0) status = nfs_readdir_folio_filler(desc, entry, pages, pglen, arrays, narrays, change_attr); else nfs_readdir_folio_set_eof(folio); desc->buffer_fills++; free_pages: nfs_readdir_free_pages(pages, array_size); out: nfs_free_fattr(entry->fattr); nfs_free_fhandle(entry->fh); kfree(entry); return status; } static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc) { folio_put(desc->folio); desc->folio = NULL; } static void nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { folio_unlock(desc->folio); nfs_readdir_folio_put(desc); } static struct folio * nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc) { struct address_space *mapping = desc->file->f_mapping; u64 change_attr = inode_peek_iversion_raw(mapping->host); u64 cookie = desc->last_cookie; struct folio *folio; folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr); if (!folio) return NULL; if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio)) nfs_readdir_folio_reinit_array(folio, cookie, change_attr); return folio; } /* * Returns 0 if desc->dir_cookie was found on page desc->page_index * and locks the page to prevent removal from the page cache. */ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; desc->folio = nfs_readdir_folio_get_cached(desc); if (!desc->folio) return -ENOMEM; if (nfs_readdir_folio_needs_filling(desc->folio)) { /* Grow the dtsize if we had to go back for more pages */ if (desc->folio_index == desc->folio_index_max) nfs_grow_dtsize(desc); desc->folio_index_max = desc->folio_index; trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf, desc->last_cookie, desc->folio->index, desc->dtsize); res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, &desc->folio, 1); if (res < 0) { nfs_readdir_folio_unlock_and_put_cached(desc); trace_nfs_readdir_cache_fill_done(inode, res); if (res == -EBADCOOKIE || res == -ENOTSYNC) { invalidate_inode_pages2(desc->file->f_mapping); nfs_readdir_rewind_search(desc); trace_nfs_readdir_invalidate_cache_range( inode, 0, MAX_LFS_FILESIZE); return -EAGAIN; } return res; } /* * Set the cookie verifier if the page cache was empty */ if (desc->last_cookie == 0 && memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) { memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); invalidate_inode_pages2_range(desc->file->f_mapping, 1, -1); trace_nfs_readdir_invalidate_cache_range( inode, 1, MAX_LFS_FILESIZE); } desc->clear_cache = false; } res = nfs_readdir_search_array(desc); if (res == 0) return 0; nfs_readdir_folio_unlock_and_put_cached(desc); return res; } /* Search for desc->dir_cookie from the beginning of the page cache */ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; do { res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; } #define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) /* * Once we've found the start of the dirent within a page: fill 'er up... */ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, const __be32 *verf) { struct file *file = desc->file; struct nfs_cache_array *array; unsigned int i; bool first_emit = !desc->dir_cookie; array = kmap_local_folio(desc->folio, 0); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; /* * nfs_readdir_handle_cache_misses return force clear at * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1 * entries need be emitted here. */ if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) { desc->eob = true; break; } ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eob = true; break; } memcpy(desc->verf, verf, sizeof(desc->verf)); if (i == array->size - 1) { desc->dir_cookie = array->last_cookie; nfs_readdir_seek_next_array(array, desc); } else { desc->dir_cookie = array->array[i + 1].cookie; desc->last_cookie = array->array[0].cookie; } if (nfs_readdir_use_cookie(file)) desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; } if (array->folio_is_eof) desc->eof = !desc->eob; kunmap_local(array); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", (unsigned long long)desc->dir_cookie); } /* * If we cannot find a cookie in our cache, we suspect that this is * because it points to a deleted file, so we ask the server to return * whatever it thinks is the next entry. We then feed this to filldir. * If all goes well, we should then be able to find our way round the * cache on the next call to readdir_search_pagecache(); * * NOTE: we cannot add the anonymous page to the pagecache because * the data it contains might not be page aligned. Besides, * we should already have a complete representation of the * directory in the page cache by the time we get here. */ static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct folio **arrays; size_t i, sz = 512; __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status = -ENOMEM; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n", (unsigned long long)desc->dir_cookie); arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); if (!arrays) goto out; arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL); if (!arrays[0]) goto out; desc->folio_index = 0; desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; desc->folio_index_max = 0; trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, -1, desc->dtsize); status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); if (status < 0) { trace_nfs_readdir_uncached_done(file_inode(desc->file), status); goto out_free; } for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->folio = arrays[i]; nfs_do_filldir(desc, verf); } desc->folio = NULL; /* * Grow the dtsize if we have to go back for more pages, * or shrink it if we're reading too many. */ if (!desc->eof) { if (!desc->eob) nfs_grow_dtsize(desc); else if (desc->buffer_fills == 1 && i < (desc->folio_index_max >> 1)) nfs_shrink_dtsize(desc); } out_free: for (i = 0; i < sz && arrays[i]; i++) nfs_readdir_folio_array_free(arrays[i]); out: if (!nfs_readdir_use_cookie(desc->file)) nfs_readdir_rewind_search(desc); desc->folio_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } static bool nfs_readdir_handle_cache_misses(struct inode *inode, struct nfs_readdir_descriptor *desc, unsigned int cache_misses, bool force_clear) { if (desc->ctx->pos == 0 || !desc->plus) return false; if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear) return false; trace_nfs_readdir_force_readdirplus(inode); return true; } /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. */ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; unsigned int cache_hits, cache_misses; bool force_clear; int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); /* * ctx->pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have * to either find the entry with the appropriate number or * revalidate the cookie. */ nfs_revalidate_mapping(inode, file->f_mapping); res = -ENOMEM; desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) goto out; desc->file = file; desc->ctx = ctx; desc->folio_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; desc->folio_index = dir_ctx->page_index; desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; nfs_set_dtsize(desc, dir_ctx->dtsize); memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); force_clear = dir_ctx->force_clear; spin_unlock(&file->f_lock); if (desc->eof) { res = 0; goto out_free; } desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses, force_clear); desc->clear_cache = force_clear; do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ if (desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) continue; if (res == -EBADCOOKIE || res == -ENOTSYNC) res = 0; } break; } if (res == -ETOOSMALL && desc->plus) { nfs_zap_caches(inode); desc->plus = false; desc->eof = false; continue; } if (res < 0) break; nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_folio_unlock_and_put_cached(desc); if (desc->folio_index == desc->folio_index_max) desc->clear_cache = force_clear; } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; dir_ctx->last_cookie = desc->last_cookie; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->folio_index; dir_ctx->force_clear = force_clear; dir_ctx->eof = desc->eof; dir_ctx->dtsize = desc->dtsize; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); out_free: kfree(desc); out: dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); switch (whence) { default: return -EINVAL; case SEEK_SET: if (offset < 0) return -EINVAL; spin_lock(&filp->f_lock); break; case SEEK_CUR: if (offset == 0) return filp->f_pos; spin_lock(&filp->f_lock); offset += filp->f_pos; if (offset < 0) { spin_unlock(&filp->f_lock); return -EINVAL; } } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->page_index = 0; if (!nfs_readdir_use_cookie(filp)) { dir_ctx->dir_cookie = 0; dir_ctx->last_cookie = 0; } else { dir_ctx->dir_cookie = offset; dir_ctx->last_cookie = offset; } dir_ctx->eof = false; } spin_unlock(&filp->f_lock); return offset; } /* * All directory operations under NFS are synchronous, so fsync() * is a dummy operation. */ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, int datasync) { dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC); return 0; } /** * nfs_force_lookup_revalidate - Mark the directory as having changed * @dir: pointer to directory inode * * This forces the revalidation code in nfs_lookup_revalidate() to do a * full lookup on all child dentries of 'dir' whenever a change occurs * on the server that might have invalidated our dcache. * * Note that we reserve bit '0' as a tag to let us know when a dentry * was revalidated while holding a delegation on its inode. * * The caller should be holding dir->i_lock */ void nfs_force_lookup_revalidate(struct inode *dir) { NFS_I(dir)->cache_change_attribute += 2; } EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); /** * nfs_verify_change_attribute - Detects NFS remote directory changes * @dir: pointer to parent directory inode * @verf: previously saved change attribute * * Return "false" if the verifiers doesn't match the change attribute. * This would usually indicate that the directory contents have changed on * the server, and that any dentries need revalidating. */ static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf) { return (verf & ~1UL) == nfs_save_change_attribute(dir); } static void nfs_set_verifier_delegated(unsigned long *verf) { *verf |= 1UL; } #if IS_ENABLED(CONFIG_NFS_V4) static void nfs_unset_verifier_delegated(unsigned long *verf) { *verf &= ~1UL; } #endif /* IS_ENABLED(CONFIG_NFS_V4) */ static bool nfs_test_verifier_delegated(unsigned long verf) { return verf & 1; } static bool nfs_verifier_is_delegated(struct dentry *dentry) { return nfs_test_verifier_delegated(dentry->d_time); } static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) { struct inode *inode = d_inode(dentry); struct inode *dir = d_inode_rcu(dentry->d_parent); if (!dir || !nfs_verify_change_attribute(dir, verf)) return; if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0)) nfs_set_verifier_delegated(&verf); dentry->d_time = verf; } /** * nfs_set_verifier - save a parent directory verifier in the dentry * @dentry: pointer to dentry * @verf: verifier to save * * Saves the parent directory verifier in @dentry. If the inode has * a delegation, we also tag the dentry as having been revalidated * while holding a delegation so that we know we don't have to * look it up again after a directory change. */ void nfs_set_verifier(struct dentry *dentry, unsigned long verf) { spin_lock(&dentry->d_lock); nfs_set_verifier_locked(dentry, verf); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL_GPL(nfs_set_verifier); #if IS_ENABLED(CONFIG_NFS_V4) /** * nfs_clear_verifier_delegated - clear the dir verifier delegation tag * @inode: pointer to inode * * Iterates through the dentries in the inode alias list and clears * the tag used to indicate that the dentry has been revalidated * while holding a delegation. * This function is intended for use when the delegation is being * returned or revoked. */ void nfs_clear_verifier_delegated(struct inode *inode) { struct dentry *alias; if (!inode) return; spin_lock(&inode->i_lock); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); nfs_unset_verifier_delegated(&alias->d_time); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) && d_really_is_negative(dentry)) return dentry->d_time == inode_peek_iversion_raw(dir); return nfs_verify_change_attribute(dir, dentry->d_time); } /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy * and may need to be looked up again. * If rcu_walk prevents us from performing a full check, return 0. */ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, int rcu_walk) { if (IS_ROOT(dentry)) return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) return 0; if (!nfs_dentry_verify_change(dir, dentry)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ if (nfs_mapping_need_revalidate_inode(dir)) { if (rcu_walk) return 0; if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) return 0; } if (!nfs_dentry_verify_change(dir, dentry)) return 0; return 1; } /* * Use intent information to check whether or not we're going to do * an O_EXCL create using this path component. */ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) == (LOOKUP_CREATE | LOOKUP_EXCL); } /* * Inode and filehandle revalidation for lookups. * * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, * or if the intent information indicates that we're about to open this * particular file and the "nocto" mount flag is not set. * */ static int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) { struct nfs_server *server = NFS_SERVER(inode); int ret; if (IS_AUTOMOUNT(inode)) return 0; if (flags & LOOKUP_OPEN) { switch (inode->i_mode & S_IFMT) { case S_IFREG: /* A NFSv4 OPEN will revalidate later */ if (server->caps & NFS_CAP_ATOMIC_OPEN) goto out; fallthrough; case S_IFDIR: if (server->flags & NFS_MOUNT_NOCTO) break; /* NFS close-to-open cache consistency validation */ goto out_force; } } /* VFS wants an on-the-wire revalidation */ if (flags & LOOKUP_REVAL) goto out_force; out: if (inode->i_nlink > 0 || (inode->i_nlink == 0 && test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags))) return 0; else return -ESTALE; out_force: if (flags & LOOKUP_RCU) return -ECHILD; ret = __nfs_revalidate_inode(server, inode); if (ret != 0) return ret; goto out; } static void nfs_mark_dir_for_revalidate(struct inode *inode) { spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); spin_unlock(&inode->i_lock); } /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. * * If parent mtime has changed, we revalidate, else we wait for a * period corresponding to the parent's attribute cache timeout value. * * If LOOKUP_RCU prevents us from performing a full check, return 1 * suggesting a reval is needed. * * Note that when creating a new file, or looking up a rename target, * then it shouldn't be necessary to revalidate a negative dentry. */ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; /* Case insensitive server? Revalidate negative dentries */ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) return 1; return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } static int nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, struct inode *inode, int error) { switch (error) { case 1: break; case -ETIMEDOUT: if (inode && (IS_ROOT(dentry) || NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)) error = 1; break; case -ESTALE: case -ENOENT: error = 0; fallthrough; default: /* * We can't d_drop the root of a disconnected tree: * its d_hash is on the s_anon list and d_drop() would hide * it from shrink_dcache_for_unmount(), leading to busy * inodes on unmount and further oopses. */ if (inode && IS_ROOT(dentry)) error = 1; break; } trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error); return error; } static int nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry, unsigned int flags) { int ret = 1; if (nfs_neg_need_reval(dir, dentry, flags)) { if (flags & LOOKUP_RCU) return -ECHILD; ret = 0; } return nfs_lookup_revalidate_done(dir, dentry, NULL, ret); } static int nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, struct inode *inode) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name, struct dentry *dentry, struct inode *inode, unsigned int flags) { struct nfs_fh *fhandle; struct nfs_fattr *fattr; unsigned long dir_verifier; int ret; trace_nfs_lookup_revalidate_enter(dir, dentry, flags); ret = -ENOMEM; fhandle = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fhandle == NULL || fattr == NULL) goto out; dir_verifier = nfs_save_change_attribute(dir); ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr); if (ret < 0) goto out; /* Request help from readdirplus */ nfs_lookup_advise_force_readdirplus(dir, flags); ret = 0; if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out; if (nfs_refresh_inode(inode, fattr) < 0) goto out; nfs_setsecurity(inode, fattr); nfs_set_verifier(dentry, dir_verifier); ret = 1; out: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); /* * If the lookup failed despite the dentry change attribute being * a match, then we should revalidate the directory cache. */ if (!ret && nfs_dentry_verify_change(dir, dentry)) nfs_mark_dir_for_revalidate(dir); return nfs_lookup_revalidate_done(dir, dentry, inode, ret); } /* * This is called every time the dcache has a lookup hit, * and we should check whether we can really trust that * lookup. * * NOTE! The hit can be a negative hit too, don't assume * we have an inode! * * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ static int nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct inode *inode; int error = 0; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); if (!inode) return nfs_lookup_revalidate_negative(dir, dentry, flags); if (is_bad_inode(inode)) { dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); goto out_bad; } if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 && nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) goto out_bad; if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { error = nfs_lookup_verify_inode(inode, flags); if (error) { if (error == -ESTALE) nfs_mark_dir_for_revalidate(dir); goto out_bad; } goto out_valid; } if (flags & LOOKUP_RCU) return -ECHILD; if (NFS_STALE(inode)) goto out_bad; return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: if (flags & LOOKUP_RCU) return -ECHILD; return nfs_lookup_revalidate_done(dir, dentry, inode, error); } static int __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) { if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) return -ECHILD; } else { /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, smp_load_acquire(&dentry->d_fsdata) != NFS_FSDATA_BLOCKED); } return 0; } static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (__nfs_lookup_revalidate(dentry, flags)) return -ECHILD; return nfs_do_lookup_revalidate(dir, name, dentry, flags); } static void block_revalidate(struct dentry *dentry) { /* old devname - just in case */ kfree(dentry->d_fsdata); /* Any new reference that could lead to an open * will take ->d_lock in lookup_open() -> d_lookup(). * Holding this lock ensures we cannot race with * __nfs_lookup_revalidate() and removes and need * for further barriers. */ lockdep_assert_held(&dentry->d_lock); dentry->d_fsdata = NFS_FSDATA_BLOCKED; } static void unblock_revalidate(struct dentry *dentry) { store_release_wake_up(&dentry->d_fsdata, NULL); } /* * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. This is called when a * pathwalk ends on a dentry that was not found via a normal lookup in the * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). * * In this situation, we just want to verify that the inode itself is OK * since the dentry might have changed on the server. */ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode = d_inode(dentry); int error = 0; /* * I believe we can only get a negative dentry here in the case of a * procfs-style symlink. Just assume it's correct for now, but we may * eventually need to do something more here. */ if (!inode) { dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n", __func__, dentry); return 1; } if (is_bad_inode(inode)) { dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); return 0; } error = nfs_lookup_verify_inode(inode, flags); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; } /* * This is called from dput() when d_count is going to 0. */ static int nfs_dentry_delete(const struct dentry *dentry) { dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n", dentry, dentry->d_flags); /* Unhash any dentry with a stale inode */ if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry))) return 1; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { /* Unhash it, so that ->d_iput() would be called */ return 1; } if (!(dentry->d_sb->s_flags & SB_ACTIVE)) { /* Unhash it, so that ancestors of killed async unlink * files will be cleaned up during umount */ return 1; } return 0; } /* Ensure that we revalidate inode->i_nlink */ static void nfs_drop_nlink(struct inode *inode) { spin_lock(&inode->i_lock); /* drop the inode if we're reasonably sure this is the last link */ if (inode->i_nlink > 0) drop_nlink(inode); NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid( inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_NLINK); spin_unlock(&inode->i_lock); } /* * Called when the dentry loses inode. * We use it to clean up silly-renamed files. */ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) { if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { nfs_complete_unlink(dentry, inode); nfs_drop_nlink(inode); } iput(inode); } static void nfs_d_release(struct dentry *dentry) { /* free cached devname value, if it survived that far */ if (unlikely(dentry->d_fsdata)) { if (dentry->d_flags & DCACHE_NFSFS_RENAMED) WARN_ON(1); else kfree(dentry->d_fsdata); } } const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, .d_release = nfs_d_release, }; EXPORT_SYMBOL_GPL(nfs_dentry_operations); struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; unsigned long dir_verifier; int error; dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen)) return ERR_PTR(-ENAMETOOLONG); /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ if (nfs_is_exclusive_create(dir, flags) || flags & LOOKUP_RENAME_TARGET) return NULL; res = ERR_PTR(-ENOMEM); fhandle = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(dir)); if (fhandle == NULL || fattr == NULL) goto out; dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, fhandle, fattr); if (error == -ENOENT) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); goto no_entry; } if (error < 0) { res = ERR_PTR(error); goto out; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr); res = ERR_CAST(inode); if (IS_ERR(res)) goto out; /* Notify readdir to use READDIRPLUS */ nfs_lookup_advise_force_readdirplus(dir, flags); no_entry: res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) goto out; dentry = res; } nfs_set_verifier(dentry, dir_verifier); out: trace_nfs_lookup_exit(dir, dentry, flags, PTR_ERR_OR_ZERO(res)); nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); return res; } EXPORT_SYMBOL_GPL(nfs_lookup); void nfs_d_prune_case_insensitive_aliases(struct inode *inode) { /* Case insensitive server? Revalidate dentries */ if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE)) d_prune_aliases(inode); } EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); #if IS_ENABLED(CONFIG_NFS_V4) static int nfs4_lookup_revalidate(struct inode *, const struct qstr *, struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, .d_release = nfs_d_release, }; EXPORT_SYMBOL_GPL(nfs4_dentry_operations); static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); } static int do_open(struct inode *inode, struct file *filp) { nfs_fscache_open_file(inode, filp); return 0; } static int nfs_finish_open(struct nfs_open_context *ctx, struct dentry *dentry, struct file *file, unsigned open_flags) { int err; err = finish_open(file, dentry, do_open); if (err) goto out; if (S_ISREG(file_inode(file)->i_mode)) nfs_file_set_open_context(file, ctx); else err = -EOPENSTALE; out: return err; } int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; unsigned long dir_verifier; bool switched = false; int created = 0; int err; /* Expect a negative dentry */ BUG_ON(d_inode(dentry)); dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); if (err) return err; /* NFS only supports OPEN on regular files */ if ((open_flags & O_DIRECTORY)) { if (!d_in_lookup(dentry)) { /* * Hashed negative dentry with O_DIRECTORY: dentry was * revalidated and is fine, no need to perform lookup * again */ return -ENOENT; } lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY; goto no_open; } if (dentry->d_name.len > NFS_SERVER(dir)->namelen) return -ENAMETOOLONG; if (open_flags & O_CREAT) { struct nfs_server *server = NFS_SERVER(dir); if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) mode &= ~current_umask(); attr.ia_valid |= ATTR_MODE; attr.ia_mode = mode; } if (open_flags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; } if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) { d_drop(dentry); switched = true; dentry = d_alloc_parallel(dentry->d_parent, &dentry->d_name, &wq); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (unlikely(!d_in_lookup(dentry))) return finish_no_open(file, dentry); } ctx = create_nfs_open_context(dentry, open_flags, file); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; trace_nfs_atomic_open_enter(dir, ctx, open_flags); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, &created); if (created) file->f_mode |= FMODE_CREATED; if (IS_ERR(inode)) { err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); d_drop(dentry); switch (err) { case -ENOENT: d_splice_alias(NULL, dentry); if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); else dir_verifier = nfs_save_change_attribute(dir); nfs_set_verifier(dentry, dir_verifier); break; case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(open_flags & O_NOFOLLOW)) goto no_open; break; /* case -EINVAL: */ default: break; } goto out; } file->f_mode |= FMODE_CAN_ODIRECT; err = nfs_finish_open(ctx, ctx->dentry, file, open_flags); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); out: if (unlikely(switched)) { d_lookup_done(dentry); dput(dentry); } return err; no_open: res = nfs_lookup(dir, dentry, lookup_flags); if (!res) { inode = d_inode(dentry); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) res = ERR_PTR(-ENOTDIR); else if (inode && S_ISREG(inode->i_mode)) res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { dput(res); res = ERR_PTR(-ENOTDIR); } else if (inode && S_ISREG(inode->i_mode)) { dput(res); res = ERR_PTR(-EOPENSTALE); } } if (switched) { d_lookup_done(dentry); if (!res) res = dentry; else dput(dentry); } if (IS_ERR(res)) return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); static int nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct inode *inode; if (__nfs_lookup_revalidate(dentry, flags)) return -ECHILD; trace_nfs_lookup_revalidate_enter(dir, dentry, flags); if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) goto full_reval; if (d_mountpoint(dentry)) goto full_reval; inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ if (inode == NULL) goto full_reval; if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) goto full_reval; /* We cannot do exclusive creation on a positive dentry */ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL)) goto reval_dentry; /* Check if the directory changed */ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) goto reval_dentry; /* Let f_op->open() actually open (and revalidate) the file */ return 1; reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); full_reval: return nfs_do_lookup_revalidate(dir, name, dentry, flags); } #endif /* CONFIG_NFSV4 */ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode) { /* Same as look+open from lookup_open(), but with different O_TRUNC * handling. */ int error = 0; if (dentry->d_name.len > NFS_SERVER(dir)->namelen) return -ENAMETOOLONG; if (open_flags & O_CREAT) { file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); if (error) return error; return finish_open(file, dentry, NULL); } else if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and * we want those to be zero so the lookup isn't skipped. */ struct dentry *res = nfs_lookup(dir, dentry, 0); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) return PTR_ERR(res); return finish_no_open(file, res); } } return finish_no_open(file, NULL); } EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); struct dentry * nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; struct dentry *d; int error; d_drop(dentry); if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, fhandle, fattr); if (error) goto out_error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); if (error < 0) goto out_error; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr); d = d_splice_alias(inode, dentry); out: dput(parent); return d; out_error: d = ERR_PTR(error); goto out; } EXPORT_SYMBOL_GPL(nfs_add_or_obtain); /* * Code common to create, mkdir, and mknod. */ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct dentry *d; d = nfs_add_or_obtain(dentry, fhandle, fattr); if (IS_ERR(d)) return PTR_ERR(d); /* Callers don't care */ dput(d); return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); /* * Following a failed create operation, we drop the dentry rather * than retain a negative dentry. This avoids a problem in the event * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ static int nfs_do_create(struct inode *dir, struct dentry *dentry, umode_t mode, int open_flags) { struct iattr attr; int error; open_flags |= O_CREAT; dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; if (open_flags & O_TRUNC) { attr.ia_size = 0; attr.ia_valid |= ATTR_SIZE; } trace_nfs_create_enter(dir, dentry, open_flags); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); trace_nfs_create_exit(dir, dentry, open_flags, error); if (error != 0) goto out_err; return 0; out_err: d_drop(dentry); return error; } int nfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0); } EXPORT_SYMBOL_GPL(nfs_create); /* * See comments for nfs_proc_create regarding failed operations. */ int nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; trace_nfs_mknod_enter(dir, dentry); status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); trace_nfs_mknod_exit(dir, dentry, status); if (status != 0) goto out_err; return 0; out_err: d_drop(dentry); return status; } EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; struct dentry *ret; dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; trace_nfs_mkdir_enter(dir, dentry); ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret)); return ret; } EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { if (simple_positive(dentry)) d_delete(dentry); } static void nfs_dentry_remove_handle_error(struct inode *dir, struct dentry *dentry, int error) { switch (error) { case -ENOENT: if (d_really_is_positive(dentry)) d_delete(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; case 0: nfs_d_prune_case_insensitive_aliases(d_inode(dentry)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } } int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); if (d_really_is_positive(dentry)) { down_write(&NFS_I(d_inode(dentry))->rmdir_sem); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ switch (error) { case 0: clear_nlink(d_inode(dentry)); break; case -ENOENT: nfs_dentry_handle_enoent(dentry); } up_write(&NFS_I(d_inode(dentry))->rmdir_sem); } else error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); nfs_dentry_remove_handle_error(dir, dentry, error); trace_nfs_rmdir_exit(dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_rmdir); /* * Remove a file after making sure there are no pending writes, * and after checking that the file has only one user. * * We invalidate the attribute cache and free the inode prior to the operation * to avoid possible races if the server reuses the inode. */ static int nfs_safe_remove(struct dentry *dentry) { struct inode *dir = d_inode(dentry->d_parent); struct inode *inode = d_inode(dentry); int error = -EBUSY; dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry); /* If the dentry was sillyrenamed, we simply call d_delete() */ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { error = 0; goto out; } trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { error = NFS_PROTO(dir)->remove(dir, dentry); if (error == 0) nfs_drop_nlink(inode); } else error = NFS_PROTO(dir)->remove(dir, dentry); if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); trace_nfs_remove_exit(dir, dentry, error); out: return error; } /* We do silly rename. In case sillyrename() returns -EBUSY, the inode * belongs to an active ".nfs..." file and we return -EBUSY. * * If sillyrename() returns 0, we do nothing, otherwise we unlink. */ int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(d_inode(dentry))->flags)) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); error = nfs_sillyrename(dir, dentry); goto out; } /* We must prevent any concurrent open until the unlink * completes. ->d_revalidate will wait for ->d_fsdata * to clear. We set it here to ensure no lookup succeeds until * the unlink is complete on the server. */ error = -ETXTBSY; if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) { spin_unlock(&dentry->d_lock); goto out; } block_revalidate(dentry); spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); unblock_revalidate(dentry); out: trace_nfs_unlink_exit(dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_unlink); /* * To create a symbolic link, most file systems instantiate a new inode, * add a page to it containing the path, then write it out to the disk * using prepare_write/commit_write. * * Unfortunately the NFS client can't create the in-core inode first * because it needs a file handle to create an in-core inode (see * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the * symlink request has completed on the server. * * So instead we allocate a raw page, copy the symname into it, then do * the SYMLINK request with the page as the buffer. If it succeeds, we * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct folio *folio; char *kaddr; struct iattr attr; unsigned int pathlen = strlen(symname); int error; dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) return -ENAMETOOLONG; attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; folio = folio_alloc(GFP_USER, 0); if (!folio) return -ENOMEM; kaddr = folio_address(folio); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); trace_nfs_symlink_enter(dir, dentry); error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); folio_put(folio); return error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); /* * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0, GFP_KERNEL) == 0) { folio_mark_uptodate(folio); folio_unlock(folio); } folio_put(folio); return 0; } EXPORT_SYMBOL_GPL(nfs_symlink); int nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int error; dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", old_dentry, dentry); trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); if (S_ISREG(inode->i_mode)) nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); ihold(inode); d_add(dentry, inode); } trace_nfs_link_exit(inode, dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_link); static void nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) { struct dentry *new_dentry = data->new_dentry; unblock_revalidate(new_dentry); } static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, struct dentry *new_dentry) { struct nfs_server *server = NFS_SB(old_dentry->d_sb); if (old_dentry->d_parent != new_dentry->d_parent) return false; if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); return true; } /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a * different file handle for the same inode after a rename (e.g. when * moving to a different directory). A fail-safe method to do so would * be to look up old_dir/old_name, create a link to new_dir/new_name and * rename the old file using the sillyrename stuff. This way, the original * file in old_dir will go away when the last process iput()s the inode. * * FIXED. * * It actually works quite well. One needs to have the possibility for * at least one ".nfs..." file in each directory the file ever gets * moved or linked to which happens automagically with the new * implementation that only depends on the dcache stuff instead of * using the inode layer * * Unfortunately, things are a little more complicated than indicated * above. For a cross-directory move, we want to make sure we can get * rid of the old inode after the operation. This means there must be * no pending writes (if it's a file), and the use count must be 1. * If these conditions are met, we can drop the dentries before doing * the rename. */ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct dentry *dentry = NULL; struct rpc_task *task; bool must_unblock = false; int error = -EBUSY; if (flags) return -EINVAL; dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", old_dentry, new_dentry, d_count(new_dentry)); trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry); /* * For non-directories, check whether the target is busy and if so, * make a copy of the dentry and then do a silly-rename. If the * silly-rename succeeds, the copied dentry is hashed and becomes * the new target. */ if (new_inode && !S_ISDIR(new_inode->i_mode)) { /* We must prevent any concurrent open until the unlink * completes. ->d_revalidate will wait for ->d_fsdata * to clear. We set it here to ensure no lookup succeeds until * the unlink is complete on the server. */ error = -ETXTBSY; if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) goto out; spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { int err; spin_unlock(&new_dentry->d_lock); /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, &new_dentry->d_name); if (!dentry) goto out; /* silly-rename the existing target ... */ err = nfs_sillyrename(new_dir, new_dentry); if (err) goto out; new_dentry = dentry; new_inode = NULL; } else { block_revalidate(new_dentry); must_unblock = true; spin_unlock(&new_dentry->d_lock); } } if (S_ISREG(old_inode->i_mode) && nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { if (must_unblock) unblock_revalidate(new_dentry); error = PTR_ERR(task); goto out; } error = rpc_wait_for_completion_task(task); if (error != 0) { ((struct nfs_renamedata *)task->tk_calldata)->cancelled = 1; /* Paired with the atomic_dec_and_test() barrier in rpc_do_put_task() */ smp_wmb(); } else error = task->tk_status; rpc_put_task(task); /* Ensure the inode attributes are revalidated */ if (error == 0) { spin_lock(&old_inode->i_lock); NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid(old_inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); spin_unlock(&old_inode->i_lock); } out: trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); if (!error) { if (new_inode != NULL) nfs_drop_nlink(new_inode); /* * The d_move() should be here instead of in an async RPC completion * handler because we need the proper locks to move the dentry. If * we're interrupted by a signal, the async RPC completion handler * should mark the directories for revalidation. */ d_move(old_dentry, new_dentry); nfs_set_verifier(old_dentry, nfs_save_change_attribute(new_dir)); } else if (error == -ENOENT) nfs_dentry_handle_enoent(old_dentry); /* new dentry created? */ if (dentry) dput(dentry); return error; } EXPORT_SYMBOL_GPL(nfs_rename); static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); static atomic_long_t nfs_access_nr_entries; static unsigned long nfs_access_max_cachesize = 4*1024*1024; module_param(nfs_access_max_cachesize, ulong, 0644); MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length"); static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_group_info(entry->group_info); kfree_rcu(entry, rcu_head); smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); smp_mb__after_atomic(); } static void nfs_access_free_list(struct list_head *head) { struct nfs_access_entry *cache; while (!list_empty(head)) { cache = list_entry(head->next, struct nfs_access_entry, lru); list_del(&cache->lru); nfs_access_free_entry(cache); } } static unsigned long nfs_do_access_cache_scan(unsigned int nr_to_scan) { LIST_HEAD(head); struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; long freed = 0; spin_lock(&nfs_access_lru_lock); list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { struct inode *inode; if (nr_to_scan-- == 0) break; inode = &nfsi->vfs_inode; spin_lock(&inode->i_lock); if (list_empty(&nfsi->access_cache_entry_lru)) goto remove_lru_entry; cache = list_entry(nfsi->access_cache_entry_lru.next, struct nfs_access_entry, lru); list_move(&cache->lru, &head); rb_erase(&cache->rb_node, &nfsi->access_cache); freed++; if (!list_empty(&nfsi->access_cache_entry_lru)) list_move_tail(&nfsi->access_cache_inode_lru, &nfs_access_lru_list); else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); smp_mb__before_atomic(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); smp_mb__after_atomic(); } spin_unlock(&inode->i_lock); } spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); return freed; } unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { int nr_to_scan = sc->nr_to_scan; gfp_t gfp_mask = sc->gfp_mask; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; return nfs_do_access_cache_scan(nr_to_scan); } unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); } static void nfs_access_cache_enforce_limit(void) { long nr_entries = atomic_long_read(&nfs_access_nr_entries); unsigned long diff; unsigned int nr_to_scan; if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize) return; nr_to_scan = 100; diff = nr_entries - nfs_access_max_cachesize; if (diff < nr_to_scan) nr_to_scan = diff; nfs_do_access_cache_scan(nr_to_scan); } static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) { struct rb_root *root_node = &nfsi->access_cache; struct rb_node *n; struct nfs_access_entry *entry; /* Unhook entries from the cache */ while ((n = rb_first(root_node)) != NULL) { entry = rb_entry(n, struct nfs_access_entry, rb_node); rb_erase(n, root_node); list_move(&entry->lru, head); } nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; } void nfs_access_zap_cache(struct inode *inode) { LIST_HEAD(head); if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) return; /* Remove from global LRU init */ spin_lock(&nfs_access_lru_lock); if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) list_del_init(&NFS_I(inode)->access_cache_inode_lru); spin_lock(&inode->i_lock); __nfs_access_zap_cache(NFS_I(inode), &head); spin_unlock(&inode->i_lock); spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); } EXPORT_SYMBOL_GPL(nfs_access_zap_cache); static int access_cmp(const struct cred *a, const struct nfs_access_entry *b) { struct group_info *ga, *gb; int g; if (uid_lt(a->fsuid, b->fsuid)) return -1; if (uid_gt(a->fsuid, b->fsuid)) return 1; if (gid_lt(a->fsgid, b->fsgid)) return -1; if (gid_gt(a->fsgid, b->fsgid)) return 1; ga = a->group_info; gb = b->group_info; if (ga == gb) return 0; if (ga == NULL) return -1; if (gb == NULL) return 1; if (ga->ngroups < gb->ngroups) return -1; if (ga->ngroups > gb->ngroups) return 1; for (g = 0; g < ga->ngroups; g++) { if (gid_lt(ga->gid[g], gb->gid[g])) return -1; if (gid_gt(ga->gid[g], gb->gid[g])) return 1; } return 0; } static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred) { struct rb_node *n = NFS_I(inode)->access_cache.rb_node; while (n != NULL) { struct nfs_access_entry *entry = rb_entry(n, struct nfs_access_entry, rb_node); int cmp = access_cmp(cred, entry); if (cmp < 0) n = n->rb_left; else if (cmp > 0) n = n->rb_right; else return entry; } return NULL; } static u64 nfs_access_login_time(const struct task_struct *task, const struct cred *cred) { const struct task_struct *parent; const struct cred *pcred; u64 ret; rcu_read_lock(); for (;;) { parent = rcu_dereference(task->real_parent); pcred = __task_cred(parent); if (parent == task || cred_fscmp(pcred, cred) != 0) break; task = parent; } ret = task->start_time; rcu_read_unlock(); return ret; } static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; bool retry = true; int err; spin_lock(&inode->i_lock); for(;;) { if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out_zap; cache = nfs_access_search_rbtree(inode, cred); err = -ENOENT; if (cache == NULL) goto out; /* Found an entry, is our attribute cache valid? */ if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) break; if (!retry) break; err = -ECHILD; if (!may_block) goto out; spin_unlock(&inode->i_lock); err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (err) return err; spin_lock(&inode->i_lock); retry = false; } err = -ENOENT; if ((s64)(login_time - cache->timestamp) > 0) goto out; *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; out: spin_unlock(&inode->i_lock); return err; out_zap: spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); return -ENOENT; } static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask) { /* Only check the most recently returned cache entry, * but do it without locking. */ struct nfs_inode *nfsi = NFS_I(inode); u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; int err = -ECHILD; struct list_head *lh; rcu_read_lock(); if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out; lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || access_cmp(cred, cache) != 0) cache = NULL; if (cache == NULL) goto out; if ((s64)(login_time - cache->timestamp) > 0) goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; *mask = cache->mask; err = 0; out: rcu_read_unlock(); return err; } int nfs_access_get_cached(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { int status; status = nfs_access_get_cached_rcu(inode, cred, mask); if (status != 0) status = nfs_access_get_cached_locked(inode, cred, mask, may_block); return status; } EXPORT_SYMBOL_GPL(nfs_access_get_cached); static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set, const struct cred *cred) { struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; struct rb_node **p = &root_node->rb_node; struct rb_node *parent = NULL; struct nfs_access_entry *entry; int cmp; spin_lock(&inode->i_lock); while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); cmp = access_cmp(cred, entry); if (cmp < 0) p = &parent->rb_left; else if (cmp > 0) p = &parent->rb_right; else goto found; } rb_link_node(&set->rb_node, parent, p); rb_insert_color(&set->rb_node, root_node); list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); spin_unlock(&inode->i_lock); return; found: rb_replace_node(parent, &set->rb_node, root_node); list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); list_del(&entry->lru); spin_unlock(&inode->i_lock); nfs_access_free_entry(entry); } void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, const struct cred *cred) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); cache->fsuid = cred->fsuid; cache->fsgid = cred->fsgid; cache->group_info = get_group_info(cred->group_info); cache->mask = set->mask; cache->timestamp = ktime_get_ns(); /* The above field assignments must be visible * before this item appears on the lru. We cannot easily * use rcu_assign_pointer, so just force the memory barrier. */ smp_wmb(); nfs_access_add_rbtree(inode, cache, cred); /* Update accounting */ smp_mb__before_atomic(); atomic_long_inc(&nfs_access_nr_entries); smp_mb__after_atomic(); /* Add inode to global LRU list */ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { spin_lock(&nfs_access_lru_lock); if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); spin_unlock(&nfs_access_lru_lock); } nfs_access_cache_enforce_limit(); } EXPORT_SYMBOL_GPL(nfs_access_add_cache); #define NFS_MAY_READ (NFS_ACCESS_READ) #define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \ NFS_ACCESS_EXTEND | \ NFS_ACCESS_DELETE) #define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \ NFS_ACCESS_EXTEND) #define NFS_DIR_MAY_WRITE NFS_MAY_WRITE #define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP) #define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE) static int nfs_access_calc_mask(u32 access_result, umode_t umode) { int mask = 0; if (access_result & NFS_MAY_READ) mask |= MAY_READ; if (S_ISDIR(umode)) { if ((access_result & NFS_DIR_MAY_WRITE) == NFS_DIR_MAY_WRITE) mask |= MAY_WRITE; if ((access_result & NFS_MAY_LOOKUP) == NFS_MAY_LOOKUP) mask |= MAY_EXEC; } else if (S_ISREG(umode)) { if ((access_result & NFS_FILE_MAY_WRITE) == NFS_FILE_MAY_WRITE) mask |= MAY_WRITE; if ((access_result & NFS_MAY_EXECUTE) == NFS_MAY_EXECUTE) mask |= MAY_EXEC; } else if (access_result & NFS_MAY_WRITE) mask |= MAY_WRITE; return mask; } void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) { entry->mask = access_result; } EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) { struct nfs_access_entry cache; bool may_block = (mask & MAY_NOT_BLOCK) == 0; int cache_mask = -1; int status; trace_nfs_access_enter(inode); status = nfs_access_get_cached(inode, cred, &cache.mask, may_block); if (status == 0) goto out_cached; status = -ECHILD; if (!may_block) goto out; /* * Determine which access bits we want to ask for... */ cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND | nfs_access_xattr_mask(NFS_SERVER(inode)); if (S_ISDIR(inode->i_mode)) cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; else cache.mask |= NFS_ACCESS_EXECUTE; status = NFS_PROTO(inode)->access(inode, &cache, cred); if (status != 0) { if (status == -ESTALE) { if (!S_ISDIR(inode->i_mode)) nfs_set_inode_stale(inode); else nfs_zap_caches(inode); } goto out; } nfs_access_add_cache(inode, &cache, cred); out_cached: cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode); if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) status = -EACCES; out: trace_nfs_access_exit(inode, mask, cache_mask, status); return status; } static int nfs_open_permission_mask(int openflags) { int mask = 0; if (openflags & __FMODE_EXEC) { /* ONLY check exec rights */ mask = MAY_EXEC; } else { if ((openflags & O_ACCMODE) != O_WRONLY) mask |= MAY_READ; if ((openflags & O_ACCMODE) != O_RDONLY) mask |= MAY_WRITE; } return mask; } int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags) { return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } EXPORT_SYMBOL_GPL(nfs_may_open); static int nfs_execute_ok(struct inode *inode, int mask) { struct nfs_server *server = NFS_SERVER(inode); int ret = 0; if (S_ISDIR(inode->i_mode)) return 0; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) { if (mask & MAY_NOT_BLOCK) return -ECHILD; ret = __nfs_revalidate_inode(server, inode); } if (ret == 0 && !execute_ok(inode)) ret = -EACCES; return ret; } int nfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { const struct cred *cred = current_cred(); int res = 0; nfs_inc_stats(inode, NFSIOS_VFSACCESS); if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) goto out; /* Is this sys_access() ? */ if (mask & (MAY_ACCESS | MAY_CHDIR)) goto force_lookup; switch (inode->i_mode & S_IFMT) { case S_IFLNK: goto out; case S_IFREG: if ((mask & MAY_OPEN) && nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)) return 0; break; case S_IFDIR: /* * Optimize away all write operations, since the server * will check permissions when we perform the op. */ if ((mask & MAY_WRITE) && !(mask & MAY_READ)) goto out; } force_lookup: if (!NFS_PROTO(inode)->access) goto out_notsup; res = nfs_do_access(inode, cred, mask); out: if (!res && (mask & MAY_EXEC)) res = nfs_execute_ok(inode, mask); dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: if (mask & MAY_NOT_BLOCK) return -ECHILD; res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER); if (res == 0) res = generic_permission(&nop_mnt_idmap, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); |
| 6 6 6 6 1 6 6 6 1 6 2 2 2 2 2 2 2 6 2 4 6 4 2 709 707 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | // SPDX-License-Identifier: GPL-2.0 /* * x86 single-step support code, common to 32-bit and 64-bit. */ #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/ptrace.h> #include <asm/desc.h> #include <asm/debugreg.h> #include <asm/mmu_context.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { unsigned long addr, seg; addr = regs->ip; seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; } #ifdef CONFIG_MODIFY_LDT_SYSCALL /* * We'll assume that the code segments in the GDT * are all zero-based. That is largely true: the * TLS segments are used for data, and the PNPBIOS * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { struct desc_struct *desc; unsigned long base; seg >>= 3; mutex_lock(&child->mm->context.lock); if (unlikely(!child->mm->context.ldt || seg >= child->mm->context.ldt->nr_entries)) addr = -1L; /* bogus selector, access would fault */ else { desc = &child->mm->context.ldt->entries[seg]; base = get_desc_base(desc); /* 16-bit code segment? */ if (!desc->d) addr &= 0xffff; addr += base; } mutex_unlock(&child->mm->context.lock); } #endif return addr; } static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; unsigned char opcode[15]; unsigned long addr = convert_ip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), FOLL_FORCE); for (i = 0; i < copied; i++) { switch (opcode[i]) { /* popf and iret */ case 0x9d: case 0xcf: return 1; /* CHECKME: 64 65 */ /* opcode and address size prefixes */ case 0x66: case 0x67: continue; /* irrelevant prefixes (segment overrides and repeats) */ case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64: case 0x65: case 0xf0: case 0xf2: case 0xf3: continue; #ifdef CONFIG_X86_64 case 0x40 ... 0x4f: if (!user_64bit_mode(regs)) /* 32-bit mode: register increment */ return 0; /* 64-bit mode: REX prefix */ continue; #endif /* CHECKME: f2, f3 */ /* * pushf: NOTE! We should probably not let * the user see the TF bit being set. But * it's more pain than it's worth to avoid * it, and a debugger could emulate this * all in user space if it _really_ cares. */ case 0x9c: default: return 0; } } return 0; } /* * Enable single-stepping. Return nonzero if user mode is not using TF itself. */ static int enable_single_step(struct task_struct *child) { struct pt_regs *regs = task_pt_regs(child); unsigned long oflags; /* * If we stepped into a sysenter/syscall insn, it trapped in * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. * If user-mode had set TF itself, then it's still clear from * do_debug() and we need to set it again to restore the user * state so we don't wrongly set TIF_FORCED_TF below. * If enable_single_step() was used last and that is what * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are * already set and our bookkeeping is fine. */ if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) regs->flags |= X86_EFLAGS_TF; /* * Always set TIF_SINGLESTEP. This will also * cause us to set TF when returning to user mode. */ set_tsk_thread_flag(child, TIF_SINGLESTEP); /* * Ensure that a trap is triggered once stepping out of a system * call prior to executing any user instruction. */ set_task_syscall_work(child, SYSCALL_EXIT_TRAP); oflags = regs->flags; /* Set TF on the kernel stack.. */ regs->flags |= X86_EFLAGS_TF; /* * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. * * Note that if we don't actually execute the popf because * of a signal arriving right now or suchlike, we will lose * track of the fact that it really was "us" that set it. */ if (is_setting_trap_flag(child, regs)) { clear_tsk_thread_flag(child, TIF_FORCED_TF); return 0; } /* * If TF was already set, check whether it was us who set it. * If not, we should never attempt a block step. */ if (oflags & X86_EFLAGS_TF) return test_tsk_thread_flag(child, TIF_FORCED_TF); set_tsk_thread_flag(child, TIF_FORCED_TF); return 1; } void set_task_blockstep(struct task_struct *task, bool on) { unsigned long debugctl; /* * Ensure irq/preemption can't change debugctl in between. * Note also that both TIF_BLOCKSTEP and debugctl should * be changed atomically wrt preemption. * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); if (on) { debugctl |= DEBUGCTLMSR_BTF; set_tsk_thread_flag(task, TIF_BLOCKSTEP); } else { debugctl &= ~DEBUGCTLMSR_BTF; clear_tsk_thread_flag(task, TIF_BLOCKSTEP); } if (task == current) update_debugctlmsr(debugctl); local_irq_enable(); } /* * Enable single or block step. */ static void enable_step(struct task_struct *child, bool block) { /* * Make sure block stepping (BTF) is not enabled unless it should be. * Note that we don't try to worry about any is_setting_trap_flag() * instructions after the first when using block stepping. * So no one should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ if (enable_single_step(child) && block) set_task_blockstep(child, true); else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) set_task_blockstep(child, false); } void user_enable_single_step(struct task_struct *child) { enable_step(child, 0); } void user_enable_block_step(struct task_struct *child) { enable_step(child, 1); } void user_disable_single_step(struct task_struct *child) { /* * Make sure block stepping (BTF) is disabled. */ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) set_task_blockstep(child, false); /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); /* But touch TF only if it was set by us.. */ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; } |
| 11 6 6 2 11 6 11 7 7 7 7 8 8 6 6 6 4 3 2 6 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Weighted Round-Robin Scheduling module * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest * Wensong Zhang : changed some comestics things for debugging * Wensong Zhang : changed for the d-linked destination list * Wensong Zhang : added the ip_vs_wrr_update_svc * Julian Anastasov : fixed the bug of returning destination * with weight 0 when all weights are zero */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/gcd.h> #include <net/ip_vs.h> /* The WRR algorithm depends on some caclulations: * - mw: maximum weight * - di: weight step, greatest common divisor from all weights * - cw: current required weight * As result, all weights are in the [di..mw] range with a step=di. * * First, we start with cw = mw and select dests with weight >= cw. * Then cw is reduced with di and all dests are checked again. * Last pass should be with cw = di. We have mw/di passes in total: * * pass 1: cw = max weight * pass 2: cw = max weight - di * pass 3: cw = max weight - 2 * di * ... * last pass: cw = di * * Weights are supposed to be >= di but we run in parallel with * weight changes, it is possible some dest weight to be reduced * below di, bad if it is the only available dest. * * So, we modify how mw is calculated, now it is reduced with (di - 1), * so that last cw is 1 to catch such dests with weight below di: * pass 1: cw = max weight - (di - 1) * pass 2: cw = max weight - di - (di - 1) * pass 3: cw = max weight - 2 * di - (di - 1) * ... * last pass: cw = 1 * */ /* * current destination pointer for weighted round-robin scheduling */ struct ip_vs_wrr_mark { struct ip_vs_dest *cl; /* current dest or head */ int cw; /* current weight */ int mw; /* maximum weight */ int di; /* decreasing interval */ struct rcu_head rcu_head; }; static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) { struct ip_vs_dest *dest; int weight; int g = 0; list_for_each_entry(dest, &svc->destinations, n_list) { weight = atomic_read(&dest->weight); if (weight > 0) { if (g > 0) g = gcd(weight, g); else g = weight; } } return g ? g : 1; } /* * Get the maximum weight of the service destinations. */ static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) { struct ip_vs_dest *dest; int new_weight, weight = 0; list_for_each_entry(dest, &svc->destinations, n_list) { new_weight = atomic_read(&dest->weight); if (new_weight > weight) weight = new_weight; } return weight; } static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) { struct ip_vs_wrr_mark *mark; /* * Allocate the mark variable for WRR scheduling */ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); if (mark == NULL) return -ENOMEM; mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); mark->cw = mark->mw; svc->sched_data = mark; return 0; } static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) { struct ip_vs_wrr_mark *mark = svc->sched_data; /* * Release the mark variable */ kfree_rcu(mark, rcu_head); } static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct ip_vs_wrr_mark *mark = svc->sched_data; spin_lock_bh(&svc->sched_lock); mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); if (mark->cw > mark->mw || !mark->cw) mark->cw = mark->mw; else if (mark->di > 1) mark->cw = (mark->cw / mark->di) * mark->di + 1; spin_unlock_bh(&svc->sched_lock); return 0; } /* * Weighted Round-Robin Scheduling */ static struct ip_vs_dest * ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *last, *stop = NULL; struct ip_vs_wrr_mark *mark = svc->sched_data; bool last_pass = false, restarted = false; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); spin_lock_bh(&svc->sched_lock); dest = mark->cl; /* No available dests? */ if (mark->mw == 0) goto err_noavail; last = dest; /* Stop only after all dests were checked for weight >= 1 (last pass) */ while (1) { list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) >= mark->cw) goto found; if (dest == stop) goto err_over; } mark->cw -= mark->di; if (mark->cw <= 0) { mark->cw = mark->mw; /* Stop if we tried last pass from first dest: * 1. last_pass: we started checks when cw > di but * then all dests were checked for w >= 1 * 2. last was head: the first and only traversal * was for weight >= 1, for all dests. */ if (last_pass || &last->n_list == &svc->destinations) goto err_over; restarted = true; } last_pass = mark->cw <= mark->di; if (last_pass && restarted && &last->n_list != &svc->destinations) { /* First traversal was for w >= 1 but only * for dests after 'last', now do the same * for all dests up to 'last'. */ stop = last; } } found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), refcount_read(&dest->refcnt), atomic_read(&dest->weight)); mark->cl = dest; out: spin_unlock_bh(&svc->sched_lock); return dest; err_noavail: mark->cl = dest; dest = NULL; ip_vs_scheduler_err(svc, "no destination available"); goto out; err_over: mark->cl = dest; dest = NULL; ip_vs_scheduler_err(svc, "no destination available: " "all destinations are overloaded"); goto out; } static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .name = "wrr", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, .add_dest = ip_vs_wrr_dest_changed, .del_dest = ip_vs_wrr_dest_changed, .upd_dest = ip_vs_wrr_dest_changed, .schedule = ip_vs_wrr_schedule, }; static int __init ip_vs_wrr_init(void) { return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; } static void __exit ip_vs_wrr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); synchronize_rcu(); } module_init(ip_vs_wrr_init); module_exit(ip_vs_wrr_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs weighted round-robin scheduler"); |
| 49 2 14 3 106 17 52 14 29 17 27 30 3 19 23 4 5 5 1 2 18 7 18 11 2 16 11 13 8 65 86 19 4 9 9 107 107 107 107 107 107 107 107 3 3 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GRO_H #define _NET_GRO_H #include <linux/indirect_call_wrapper.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <linux/skbuff.h> #include <net/udp.h> #include <net/hotdata.h> /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) struct napi_gro_cb { union { struct { /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ void *frag0; /* Length of frag0. */ unsigned int frag0_len; }; struct { /* used in skb_gro_receive() slow path */ struct sk_buff *last; /* jiffies when first packet was created/queued */ unsigned long age; }; }; /* This indicates where we are processing relative to skb->data. */ int data_offset; /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush; /* Number of segments aggregated. */ u16 count; /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto; u16 pad; /* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 /* portion of the cb set to zero at every gro iteration */ struct_group(zeroed, /* Start offset for remote checksum offload */ u16 gro_remcsum_start; /* This is non-zero if the packet may be of the same flow. */ u8 same_flow:1; /* Used in tunnel GRO receive */ u8 encap_mark:1; /* GRO checksum is valid */ u8 csum_valid:1; /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; /* Free the skb? */ u8 free:2; /* Used in foo-over-udp, set in udp[46]_gro_receive */ u8 is_ipv6:1; /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ u8 ip_fixedid:1; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; /* GRO is done by frag_list pointer chaining. */ u8 is_flist:1; ); /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; /* L3 offsets */ union { struct { u16 network_offset; u16 inner_network_offset; }; u16 network_offsets[2]; }; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) #define GRO_RECURSION_LIMIT 15 static inline int gro_recursion_inc_test(struct sk_buff *skb) { return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive(gro_receive_t cb, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(head, skb); } typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, struct sock *sk, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(sk, head, skb); } static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->data_offset; } static inline unsigned int skb_gro_len(const struct sk_buff *skb) { return skb->len - NAPI_GRO_CB(skb)->data_offset; } static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) { NAPI_GRO_CB(skb)->data_offset += len; } static inline void *skb_gro_header_fast(const struct sk_buff *skb, unsigned int offset) { return NAPI_GRO_CB(skb)->frag0 + offset; } static inline bool skb_gro_may_pull(const struct sk_buff *skb, unsigned int hlen) { return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len); } static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; return skb->data + offset; } static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { void *ptr; ptr = skb_gro_header_fast(skb, offset); if (!skb_gro_may_pull(skb, hlen)) ptr = skb_gro_header_slow(skb, hlen, offset); return ptr; } static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; } static inline void *skb_gro_network_header(const struct sk_buff *skb) { if (skb_gro_may_pull(skb, skb_gro_offset(skb))) return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb)); return skb->data + skb_gro_receive_network_offset(skb); } static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct iphdr *iph = skb_gro_network_header(skb); return csum_tcpudp_nofold(iph->saddr, iph->daddr, skb_gro_len(skb), proto, 0); } static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = wsum_negate(csum_partial(start, len, wsum_negate(NAPI_GRO_CB(skb)->csum))); } /* GRO checksum functions. These are logical equivalents of the normal * checksum functions (in skbuff.h) except that they operate on the GRO * offsets and fields in sk_buff. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb); static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); } static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { return ((skb->ip_summed != CHECKSUM_PARTIAL || skb_checksum_start_offset(skb) < skb_gro_offset(skb)) && !skb_at_gro_remcsum_start(skb) && NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, __wsum psum) { if (NAPI_GRO_CB(skb)->csum_valid && !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) return 0; NAPI_GRO_CB(skb)->csum = psum; return __skb_gro_checksum_complete(skb); } static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->csum_cnt > 0) { /* Consume a checksum from CHECKSUM_UNNECESSARY */ NAPI_GRO_CB(skb)->csum_cnt--; } else { /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we * verified a new top level checksum or an encapsulated one * during GRO. This saves work if we fallback to normal path. */ __skb_incr_checksum_unnecessary(skb); } } #define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ compute_pseudo) \ ({ \ __sum16 __ret = 0; \ if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ if (!__ret) \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) #define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) #define skb_gro_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) #define skb_gro_checksum_simple_validate(skb) \ __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid); } static inline void __skb_gro_checksum_convert(struct sk_buff *skb, __wsum pseudo) { NAPI_GRO_CB(skb)->csum = ~pseudo; NAPI_GRO_CB(skb)->csum_valid = 1; } #define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_gro_checksum_convert_check(skb)) \ __skb_gro_checksum_convert(skb, \ compute_pseudo(skb, proto)); \ } while (0) struct gro_remcsum { int offset; __wsum delta; }; static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) { grc->offset = 0; grc->delta = 0; } static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, unsigned int off, size_t hdrlen, int start, int offset, struct gro_remcsum *grc, bool nopartial) { __wsum delta; size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); if (!nopartial) { NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; return ptr; } ptr = skb_gro_header(skb, off + plen, off); if (!ptr) return NULL; delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, start, offset); /* Adjust skb->csum since we changed the packet */ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); grc->offset = off + hdrlen + offset; grc->delta = delta; return ptr; } static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, struct gro_remcsum *grc) { void *ptr; size_t plen = grc->offset + sizeof(u16); if (!grc->delta) return; ptr = skb_gro_header(skb, plen, grc->offset); if (!ptr) return; remcsum_unadjust((__sum16 *)ptr, grc->delta); } #ifdef CONFIG_XFRM_OFFLOAD static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { if (PTR_ERR(pp) != -EINPROGRESS) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } } #else static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } #endif INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); #define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_INET(cb, f2, f1, head, skb); \ }) struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; unsigned int hlen, off; off = skb_gro_offset(skb); hlen = off + sizeof(*uh); uh = skb_gro_header(skb, hlen, off); return uh; } static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct ipv6hdr *iph = skb_gro_network_header(skb); return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, skb_gro_len(skb), proto, 0)); } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, struct sk_buff *p, bool outer) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; const u32 df = id & IP_DF; int flush; /* All fields must match except length and checksum. */ flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); if (flush | (outer && df)) return flush; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ if (count == 1 && df && !ipid_offset) NAPI_GRO_CB(p)->ip_fixedid = true; return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) { /* <Version:4><Traffic_Class:8><Flow_Label:20> */ __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* Flush if Traffic Class fields are different. */ return !!((first_word & htonl(0x0FF00000)) | (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); } static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, bool outer) { const void *nh = th - diff; const void *nh2 = th2 - diff; if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else return inet_gro_flush(nh, nh2, p, outer); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; int off = skb_transport_offset(p); int flush; flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); if (encap_mark) flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); return flush; } int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); void __gro_flush(struct gro_node *gro, bool flush_old); static inline void gro_flush(struct gro_node *gro, bool flush_old) { if (!gro->bitmask) return; __gro_flush(gro, flush_old); } static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old) { gro_flush(&napi->gro, flush_old); } /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ static inline void gro_normal_list(struct gro_node *gro) { if (!gro->rx_count) return; netif_receive_skb_list_internal(&gro->rx_list); INIT_LIST_HEAD(&gro->rx_list); gro->rx_count = 0; } static inline void gro_flush_normal(struct gro_node *gro, bool flush_old) { gro_flush(gro, flush_old); gro_normal_list(gro); } /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &gro->rx_list); gro->rx_count += segs; if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) gro_normal_list(gro); } void gro_init(struct gro_node *gro); void gro_cleanup(struct gro_node *gro); /* This function is the alternative of 'inet_iif' and 'inet_sdif' * functions in case we can not rely on fields of IPCB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { *iif = inet_iif(skb) ?: skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } /* This function is the alternative of 'inet6_iif' and 'inet6_sdif' * functions in case we can not rely on fields of IP6CB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ *iif = skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); #endif /* _NET_GRO_H */ |
| 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | // SPDX-License-Identifier: GPL-2.0+ /* * USB Networking Link Interface * * Copyright (C) 2000-2005 by David Brownell <dbrownell@users.sourceforge.net> * Copyright (C) 2003-2005 David Hollis <dhollis@davehollis.com> */ #ifndef __LINUX_USB_USBNET_H #define __LINUX_USB_USBNET_H #include <linux/mii.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/usb.h> /* interface from usbnet core to each USB networking link we handle */ struct usbnet { /* housekeeping */ struct usb_device *udev; struct usb_interface *intf; const struct driver_info *driver_info; const char *driver_name; void *driver_priv; wait_queue_head_t wait; struct mutex phy_mutex; unsigned char suspend_count; unsigned char pkt_cnt, pkt_err; unsigned short rx_qlen, tx_qlen; unsigned can_dma_sg:1; /* i/o info: pipes etc */ unsigned in, out; struct usb_host_endpoint *status; unsigned maxpacket; struct timer_list delay; const char *padding_pkt; /* protocol/interface state */ struct net_device *net; int msg_enable; unsigned long data[5]; u32 xid; u32 hard_mtu; /* count any extra framing */ size_t rx_urb_size; /* size for rx urbs */ struct mii_if_info mii; long rx_speed; /* If MII not used */ long tx_speed; /* If MII not used */ # define SPEED_UNSET -1 /* various kinds of pending driver work */ struct sk_buff_head rxq; struct sk_buff_head txq; struct sk_buff_head done; struct sk_buff_head rxq_pause; struct urb *interrupt; unsigned interrupt_count; struct mutex interrupt_mutex; struct usb_anchor deferred; struct work_struct bh_work; struct work_struct kevent; unsigned long flags; # define EVENT_TX_HALT 0 # define EVENT_RX_HALT 1 # define EVENT_RX_MEMORY 2 # define EVENT_STS_SPLIT 3 # define EVENT_LINK_RESET 4 # define EVENT_RX_PAUSED 5 # define EVENT_DEV_ASLEEP 6 # define EVENT_DEV_OPEN 7 # define EVENT_DEVICE_REPORT_IDLE 8 # define EVENT_NO_RUNTIME_PM 9 # define EVENT_RX_KILL 10 # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 # define EVENT_NO_IP_ALIGN 13 # define EVENT_LINK_CARRIER_ON 14 /* This one is special, as it indicates that the device is going away * there are cyclic dependencies between tasklet, timer and bh * that must be broken */ # define EVENT_UNPLUG 31 }; static inline bool usbnet_going_away(struct usbnet *ubn) { return test_bit(EVENT_UNPLUG, &ubn->flags); } static inline void usbnet_mark_going_away(struct usbnet *ubn) { set_bit(EVENT_UNPLUG, &ubn->flags); } static inline struct usb_driver *driver_of(struct usb_interface *intf) { return to_usb_driver(intf->dev.driver); } /* interface from the device/framing level "minidriver" to core */ struct driver_info { char *description; int flags; /* framing is CDC Ethernet, not writing ZLPs (hw issues), or optionally: */ #define FLAG_FRAMING_NC 0x0001 /* guard against device dropouts */ #define FLAG_FRAMING_GL 0x0002 /* genelink batches packets */ #define FLAG_FRAMING_Z 0x0004 /* zaurus adds a trailer */ #define FLAG_FRAMING_RN 0x0008 /* RNDIS batches, plus huge header */ #define FLAG_NO_SETINT 0x0010 /* device can't set_interface() */ #define FLAG_ETHER 0x0020 /* maybe use "eth%d" names */ #define FLAG_FRAMING_AX 0x0040 /* AX88772/178 packets */ #define FLAG_WLAN 0x0080 /* use "wlan%d" names */ #define FLAG_AVOID_UNLINK_URBS 0x0100 /* don't unlink urbs at usbnet_stop() */ #define FLAG_SEND_ZLP 0x0200 /* hw requires ZLPs are sent */ #define FLAG_WWAN 0x0400 /* use "wwan%d" names */ #define FLAG_LINK_INTR 0x0800 /* updates link (carrier) status */ #define FLAG_POINTTOPOINT 0x1000 /* possibly use "usb%d" names */ /* * Indicates to usbnet, that USB driver accumulates multiple IP packets. * Affects statistic (counters) and short packet handling. */ #define FLAG_MULTI_PACKET 0x2000 #define FLAG_RX_ASSEMBLE 0x4000 /* rx packets may span >1 frames */ #define FLAG_NOARP 0x8000 /* device can't do ARP */ /* init device ... can sleep, or cause probe() failure */ int (*bind)(struct usbnet *, struct usb_interface *); /* cleanup device ... can sleep, but can't fail */ void (*unbind)(struct usbnet *, struct usb_interface *); /* reset device ... can sleep */ int (*reset)(struct usbnet *); /* stop device ... can sleep */ int (*stop)(struct usbnet *); /* see if peer is connected ... can sleep */ int (*check_connect)(struct usbnet *); /* (dis)activate runtime power management */ int (*manage_power)(struct usbnet *, int); /* for status polling */ void (*status)(struct usbnet *, struct urb *); /* link reset handling, called from defer_kevent */ int (*link_reset)(struct usbnet *); /* fixup rx packet (strip framing) */ int (*rx_fixup)(struct usbnet *dev, struct sk_buff *skb); /* fixup tx packet (add framing) */ struct sk_buff *(*tx_fixup)(struct usbnet *dev, struct sk_buff *skb, gfp_t flags); /* recover from timeout */ void (*recover)(struct usbnet *dev); /* early initialization code, can sleep. This is for minidrivers * having 'subminidrivers' that need to do extra initialization * right after minidriver have initialized hardware. */ int (*early_init)(struct usbnet *dev); /* called by minidriver when receiving indication */ void (*indication)(struct usbnet *dev, void *ind, int indlen); /* rx mode change (device changes address list filtering) */ void (*set_rx_mode)(struct usbnet *dev); /* for new devices, use the descriptor-reading code instead */ int in; /* rx endpoint */ int out; /* tx endpoint */ unsigned long data; /* Misc driver specific data */ }; /* Minidrivers are just drivers using the "usbnet" core as a powerful * network-specific subroutine library ... that happens to do pretty * much everything except custom framing and chip-specific stuff. */ extern int usbnet_probe(struct usb_interface *, const struct usb_device_id *); extern int usbnet_suspend(struct usb_interface *, pm_message_t); extern int usbnet_resume(struct usb_interface *); extern void usbnet_disconnect(struct usb_interface *); extern void usbnet_device_suggests_idle(struct usbnet *dev); extern int usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size); extern int usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size); extern int usbnet_read_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size); extern int usbnet_write_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size); extern int usbnet_write_cmd_async(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size); /* Drivers that reuse some of the standard USB CDC infrastructure * (notably, using multiple interfaces according to the CDC * union descriptor) get some helper code. */ struct cdc_state { struct usb_cdc_header_desc *header; struct usb_cdc_union_desc *u; struct usb_cdc_ether_desc *ether; struct usb_interface *control; struct usb_interface *data; }; extern void usbnet_cdc_update_filter(struct usbnet *dev); extern int usbnet_generic_cdc_bind(struct usbnet *, struct usb_interface *); extern int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf); extern int usbnet_cdc_bind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_unbind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_status(struct usbnet *, struct urb *); extern int usbnet_cdc_zte_rx_fixup(struct usbnet *dev, struct sk_buff *skb); /* CDC and RNDIS support the same host-chosen packet filters for IN transfers */ #define DEFAULT_FILTER (USB_CDC_PACKET_TYPE_BROADCAST \ |USB_CDC_PACKET_TYPE_ALL_MULTICAST \ |USB_CDC_PACKET_TYPE_PROMISCUOUS \ |USB_CDC_PACKET_TYPE_DIRECTED) /* we record the state for each of our queued skbs */ enum skb_state { illegal = 0, tx_start, tx_done, rx_start, rx_done, rx_cleanup, unlink_start }; struct skb_data { /* skb->cb is one of these */ struct urb *urb; struct usbnet *dev; enum skb_state state; long length; unsigned long packets; }; /* Drivers that set FLAG_MULTI_PACKET must call this in their * tx_fixup method before returning an skb. */ static inline void usbnet_set_skb_tx_stats(struct sk_buff *skb, unsigned long packets, long bytes_delta) { struct skb_data *entry = (struct skb_data *) skb->cb; entry->packets = packets; entry->length = bytes_delta; } extern int usbnet_open(struct net_device *net); extern int usbnet_stop(struct net_device *net); extern netdev_tx_t usbnet_start_xmit(struct sk_buff *skb, struct net_device *net); extern void usbnet_tx_timeout(struct net_device *net, unsigned int txqueue); extern int usbnet_change_mtu(struct net_device *net, int new_mtu); extern int usbnet_get_endpoints(struct usbnet *, struct usb_interface *); extern int usbnet_get_ethernet_addr(struct usbnet *, int); extern void usbnet_defer_kevent(struct usbnet *, int); extern void usbnet_skb_return(struct usbnet *, struct sk_buff *); extern void usbnet_unlink_rx_urbs(struct usbnet *); extern void usbnet_pause_rx(struct usbnet *); extern void usbnet_resume_rx(struct usbnet *); extern void usbnet_purge_paused_rxq(struct usbnet *); extern int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd); extern int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd); extern int usbnet_get_link_ksettings_internal(struct net_device *net, struct ethtool_link_ksettings *cmd); extern u32 usbnet_get_link(struct net_device *net); extern u32 usbnet_get_msglevel(struct net_device *); extern void usbnet_set_msglevel(struct net_device *, u32); extern void usbnet_set_rx_mode(struct net_device *net); extern void usbnet_get_drvinfo(struct net_device *, struct ethtool_drvinfo *); extern int usbnet_nway_reset(struct net_device *net); extern int usbnet_manage_power(struct usbnet *, int); extern void usbnet_link_change(struct usbnet *, bool, bool); extern int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags); extern void usbnet_status_stop(struct usbnet *dev); extern void usbnet_update_max_qlen(struct usbnet *dev); #endif /* __LINUX_USB_USBNET_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/arch/x86_64/mm/init.c * * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> */ #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/initrd.h> #include <linux/pagemap.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/pci.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/dma-mapping.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> #include <linux/memremap.h> #include <linux/nmi.h> #include <linux/gfp.h> #include <linux/kcore.h> #include <linux/bootmem_info.h> #include <asm/processor.h> #include <asm/bios_ebda.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/dma.h> #include <asm/fixmap.h> #include <asm/e820/api.h> #include <asm/apic.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/smp.h> #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/numa.h> #include <asm/set_memory.h> #include <asm/init.h> #include <asm/uv/uv.h> #include <asm/setup.h> #include <asm/ftrace.h> #include "mm_internal.h" #include "ident_map.c" #define DEFINE_POPULATE(fname, type1, type2, init) \ static inline void fname##_init(struct mm_struct *mm, \ type1##_t *arg1, type2##_t *arg2, bool init) \ { \ if (init) \ fname##_safe(mm, arg1, arg2); \ else \ fname(mm, arg1, arg2); \ } DEFINE_POPULATE(p4d_populate, p4d, pud, init) DEFINE_POPULATE(pgd_populate, pgd, p4d, init) DEFINE_POPULATE(pud_populate, pud, pmd, init) DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init) #define DEFINE_ENTRY(type1, type2, init) \ static inline void set_##type1##_init(type1##_t *arg1, \ type2##_t arg2, bool init) \ { \ if (init) \ set_##type1##_safe(arg1, arg2); \ else \ set_##type1(arg1, arg2); \ } DEFINE_ENTRY(p4d, p4d, init) DEFINE_ENTRY(pud, pud, init) DEFINE_ENTRY(pmd, pmd, init) DEFINE_ENTRY(pte, pte, init) static inline pgprot_t prot_sethuge(pgprot_t prot) { WARN_ON_ONCE(pgprot_val(prot) & _PAGE_PAT); return __pgprot(pgprot_val(prot) | _PAGE_PSE); } /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move * around without checking the pgd every time. */ /* Bits supported by the hardware: */ pteval_t __supported_pte_mask __read_mostly = ~0; /* Bits allowed in normal kernel mappings: */ pteval_t __default_kernel_pte_mask __read_mostly = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ EXPORT_SYMBOL(__default_kernel_pte_mask); int force_personality32; /* * noexec32=on|off * Control non executable heap for 32bit processes. * * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) * off PROT_READ implies PROT_EXEC */ static int __init nonx32_setup(char *str) { if (!strcmp(str, "on")) force_personality32 &= ~READ_IMPLIES_EXEC; else if (!strcmp(str, "off")) force_personality32 |= READ_IMPLIES_EXEC; return 1; } __setup("noexec32=", nonx32_setup); static void sync_global_pgds_l5(unsigned long start, unsigned long end) { unsigned long addr; for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { const pgd_t *pgd_ref = pgd_offset_k(addr); struct page *page; /* Check for overflow */ if (addr < start) break; if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(addr); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } static void sync_global_pgds_l4(unsigned long start, unsigned long end) { unsigned long addr; for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { pgd_t *pgd_ref = pgd_offset_k(addr); const p4d_t *p4d_ref; struct page *page; /* * With folded p4d, pgd_none() is always false, we need to * handle synchronization on p4d level. */ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) continue; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; p4d_t *p4d; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(addr); p4d = p4d_offset(pgd, addr); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) BUG_ON(p4d_pgtable(*p4d) != p4d_pgtable(*p4d_ref)); if (p4d_none(*p4d)) set_p4d(p4d, *p4d_ref); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } /* * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ static void sync_global_pgds(unsigned long start, unsigned long end) { if (pgtable_l5_enabled()) sync_global_pgds_l5(start, end); else sync_global_pgds_l4(start, end); } /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. */ static __ref void *spp_getpage(void) { void *ptr; if (after_bootmem) ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : ""); } pr_debug("spp_getpage %p\n", ptr); return ptr; } static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { p4d_t *p4d = (p4d_t *)spp_getpage(); pgd_populate(&init_mm, pgd, p4d); if (p4d != p4d_offset(pgd, 0)) printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", p4d, p4d_offset(pgd, 0)); } return p4d_offset(pgd, vaddr); } static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr) { if (p4d_none(*p4d)) { pud_t *pud = (pud_t *)spp_getpage(); p4d_populate(&init_mm, p4d, pud); if (pud != pud_offset(p4d, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", pud, pud_offset(p4d, 0)); } return pud_offset(p4d, vaddr); } static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) { if (pud_none(*pud)) { pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n", pmd, pmd_offset(pud, 0)); } return pmd_offset(pud, vaddr); } static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) { if (pmd_none(*pmd)) { pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #03!\n"); } return pte_offset_kernel(pmd, vaddr); } static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) { pmd_t *pmd = fill_pmd(pud, vaddr); pte_t *pte = fill_pte(pmd, vaddr); set_pte(pte, new_pte); /* * It's enough to flush this one mapping. * (PGE mappings get flushed as well) */ flush_tlb_one_kernel(vaddr); } void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) { p4d_t *p4d = p4d_page + p4d_index(vaddr); pud_t *pud = fill_pud(p4d, vaddr); __set_pte_vaddr(pud, vaddr, new_pte); } void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) { pud_t *pud = pud_page + pud_index(vaddr); __set_pte_vaddr(pud, vaddr, new_pte); } void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; p4d_t *p4d_page; pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { printk(KERN_ERR "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } p4d_page = p4d_offset(pgd, 0); set_pte_vaddr_p4d(p4d_page, vaddr, pteval); } pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(vaddr); p4d = fill_p4d(pgd, vaddr); pud = fill_pud(p4d, vaddr); return fill_pmd(pud, vaddr); } pte_t * __init populate_extra_pte(unsigned long vaddr) { pmd_t *pmd; pmd = populate_extra_pmd(vaddr); return fill_pte(pmd, vaddr); } /* * Create large page table mappings for a range of physical addresses. */ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, enum page_cache_mode cache) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgprot_t prot; pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | protval_4k_2_large(cachemode2protval(cache)); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); if (pgd_none(*pgd)) { p4d = (p4d_t *) spp_getpage(); set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE | _PAGE_USER)); } p4d = p4d_offset(pgd, (unsigned long)__va(phys)); if (p4d_none(*p4d)) { pud = (pud_t *) spp_getpage(); set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE | _PAGE_USER)); } pud = pud_offset(p4d, (unsigned long)__va(phys)); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); } pmd = pmd_offset(pud, phys); BUG_ON(!pmd_none(*pmd)); set_pmd(pmd, __pmd(phys | pgprot_val(prot))); } } void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) { __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB); } void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) { __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC); } /* * The head.S code sets up the kernel high mapping: * * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * * phys_base holds the negative offset to the kernel, which is added * to the compile time generated pmds. This results in invalid pmds up * to the point where we hit the physaddr 0 mapping. * * We limit the mappings to the region from _text to _brk_end. _brk_end * is rounded up to the 2MB boundary. This catches the invalid pmds as * well, as they are located before _text: */ void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; /* * Native path, max_pfn_mapped is not set yet. * Xen has valid max_pfn_mapped set in * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). */ if (max_pfn_mapped) vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); } } /* * Create PTE level page table mapping for physical addresses. * It returns the last physical address mapped. */ static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; pte_t *pte; int i; pte = pte_page + pte_index(paddr); i = pte_index(paddr); for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) { paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE; if (paddr >= paddr_end) { if (!after_bootmem && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_ACPI)) set_pte_init(pte, __pte(0), init); continue; } /* * We will re-use the existing mapping. * Xen for example has some special requirements, like mapping * pagetable pages as RO. So assume someone who pre-setup * these mappings are more intelligent. */ if (!pte_none(*pte)) { if (!after_bootmem) pages++; continue; } if (0) pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } update_page_count(PG_LEVEL_4K, pages); return paddr_last; } /* * Create PMD level page table mapping for physical addresses. The virtual * and physical address have to be aligned at this level. * It returns the last physical address mapped. */ static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, unsigned long page_size_mask, pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; int i = pmd_index(paddr); for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) { pmd_t *pmd = pmd_page + pmd_index(paddr); pte_t *pte; pgprot_t new_prot = prot; paddr_next = (paddr & PMD_MASK) + PMD_SIZE; if (paddr >= paddr_end) { if (!after_bootmem && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_ACPI)) set_pmd_init(pmd, __pmd(0), init); continue; } if (!pmd_none(*pmd)) { if (!pmd_leaf(*pmd)) { spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); paddr_last = phys_pte_init(pte, paddr, paddr_end, prot, init); spin_unlock(&init_mm.page_table_lock); continue; } /* * If we are ok with PG_LEVEL_2M mapping, then we will * use the existing mapping, * * Otherwise, we will split the large page mapping but * use the same existing protection bits except for * large page, so that we don't violate Intel's TLB * Application note (317080) which says, while changing * the page sizes, new and old translations should * not differ with respect to page frame and * attributes. */ if (page_size_mask & (1 << PG_LEVEL_2M)) { if (!after_bootmem) pages++; paddr_last = paddr_next; continue; } new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; spin_lock(&init_mm.page_table_lock); set_pmd_init(pmd, pfn_pmd(paddr >> PAGE_SHIFT, prot_sethuge(prot)), init); spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; } pte = alloc_low_page(); paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init); spin_lock(&init_mm.page_table_lock); pmd_populate_kernel_init(&init_mm, pmd, pte, init); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); return paddr_last; } /* * Create PUD level page table mapping for physical addresses. The virtual * and physical address do not have to be aligned at this level. KASLR can * randomize virtual addresses up to this level. * It returns the last physical address mapped. */ static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, unsigned long page_size_mask, pgprot_t _prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; unsigned long vaddr = (unsigned long)__va(paddr); int i = pud_index(vaddr); for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) { pud_t *pud; pmd_t *pmd; pgprot_t prot = _prot; vaddr = (unsigned long)__va(paddr); pud = pud_page + pud_index(vaddr); paddr_next = (paddr & PUD_MASK) + PUD_SIZE; if (paddr >= paddr_end) { if (!after_bootmem && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_ACPI)) set_pud_init(pud, __pud(0), init); continue; } if (!pud_none(*pud)) { if (!pud_leaf(*pud)) { pmd = pmd_offset(pud, 0); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, page_size_mask, prot, init); continue; } /* * If we are ok with PG_LEVEL_1G mapping, then we will * use the existing mapping. * * Otherwise, we will split the gbpage mapping but use * the same existing protection bits except for large * page, so that we don't violate Intel's TLB * Application note (317080) which says, while changing * the page sizes, new and old translations should * not differ with respect to page frame and * attributes. */ if (page_size_mask & (1 << PG_LEVEL_1G)) { if (!after_bootmem) pages++; paddr_last = paddr_next; continue; } prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); } if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; spin_lock(&init_mm.page_table_lock); set_pud_init(pud, pfn_pud(paddr >> PAGE_SHIFT, prot_sethuge(prot)), init); spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; } pmd = alloc_low_page(); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, page_size_mask, prot, init); spin_lock(&init_mm.page_table_lock); pud_populate_init(&init_mm, pud, pmd, init); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_1G, pages); return paddr_last; } static unsigned long __meminit phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, unsigned long page_size_mask, pgprot_t prot, bool init) { unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last; paddr_last = paddr_end; vaddr = (unsigned long)__va(paddr); vaddr_end = (unsigned long)__va(paddr_end); if (!pgtable_l5_enabled()) return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask, prot, init); for (; vaddr < vaddr_end; vaddr = vaddr_next) { p4d_t *p4d = p4d_page + p4d_index(vaddr); pud_t *pud; vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE; paddr = __pa(vaddr); if (paddr >= paddr_end) { paddr_next = __pa(vaddr_next); if (!after_bootmem && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_ACPI)) set_p4d_init(p4d, __p4d(0), init); continue; } if (!p4d_none(*p4d)) { pud = pud_offset(p4d, 0); paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end), page_size_mask, prot, init); continue; } pud = alloc_low_page(); paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end), page_size_mask, prot, init); spin_lock(&init_mm.page_table_lock); p4d_populate_init(&init_mm, p4d, pud, init); spin_unlock(&init_mm.page_table_lock); } return paddr_last; } static unsigned long __meminit __kernel_physical_mapping_init(unsigned long paddr_start, unsigned long paddr_end, unsigned long page_size_mask, pgprot_t prot, bool init) { bool pgd_changed = false; unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; paddr_last = paddr_end; vaddr = (unsigned long)__va(paddr_start); vaddr_end = (unsigned long)__va(paddr_end); vaddr_start = vaddr; for (; vaddr < vaddr_end; vaddr = vaddr_next) { pgd_t *pgd = pgd_offset_k(vaddr); p4d_t *p4d; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; if (pgd_val(*pgd)) { p4d = (p4d_t *)pgd_page_vaddr(*pgd); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), page_size_mask, prot, init); continue; } p4d = alloc_low_page(); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), page_size_mask, prot, init); spin_lock(&init_mm.page_table_lock); if (pgtable_l5_enabled()) pgd_populate_init(&init_mm, pgd, p4d, init); else p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d, init); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } if (pgd_changed) sync_global_pgds(vaddr_start, vaddr_end - 1); return paddr_last; } /* * Create page table mapping for the physical memory for specific physical * addresses. Note that it can only be used to populate non-present entries. * The virtual and physical addresses have to be aligned on PMD level * down. It returns the last physical address mapped. */ unsigned long __meminit kernel_physical_mapping_init(unsigned long paddr_start, unsigned long paddr_end, unsigned long page_size_mask, pgprot_t prot) { return __kernel_physical_mapping_init(paddr_start, paddr_end, page_size_mask, prot, true); } /* * This function is similar to kernel_physical_mapping_init() above with the * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe() * when updating the mapping. The caller is responsible to flush the TLBs after * the function returns. */ unsigned long __meminit kernel_physical_mapping_change(unsigned long paddr_start, unsigned long paddr_end, unsigned long page_size_mask) { return __kernel_physical_mapping_init(paddr_start, paddr_end, page_size_mask, PAGE_KERNEL, false); } #ifndef CONFIG_NUMA static __always_inline void x86_numa_init(void) { memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); } #endif void __init initmem_init(void) { x86_numa_init(); } void __init paging_init(void) { sparse_init(); /* * clear the default setting with node 0 * note: don't use nodes_clear here, that is really clearing when * numa support is not compiled in, and later node_set_state * will not set it back. */ node_clear_state(0, N_MEMORY); node_clear_state(0, N_NORMAL_MEMORY); zone_sizes_init(); } #define PAGE_UNUSED 0xFD /* * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges * from unused_pmd_start to next PMD_SIZE boundary. */ static unsigned long unused_pmd_start __meminitdata; static void __meminit vmemmap_flush_unused_pmd(void) { if (!unused_pmd_start) return; /* * Clears (unused_pmd_start, PMD_END] */ memset((void *)unused_pmd_start, PAGE_UNUSED, ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start); unused_pmd_start = 0; } #ifdef CONFIG_MEMORY_HOTPLUG /* Returns true if the PMD is completely unused and thus it can be freed */ static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) { unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); /* * Flush the unused range cache to ensure that memchr_inv() will work * for the whole range. */ vmemmap_flush_unused_pmd(); memset((void *)addr, PAGE_UNUSED, end - addr); return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE); } #endif static void __meminit __vmemmap_use_sub_pmd(unsigned long start) { /* * As we expect to add in the same granularity as we remove, it's * sufficient to mark only some piece used to block the memmap page from * getting removed when removing some other adjacent memmap (just in * case the first memmap never gets initialized e.g., because the memory * block never gets onlined). */ memset((void *)start, 0, sizeof(struct page)); } static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end) { /* * We only optimize if the new used range directly follows the * previously unused range (esp., when populating consecutive sections). */ if (unused_pmd_start == start) { if (likely(IS_ALIGNED(end, PMD_SIZE))) unused_pmd_start = 0; else unused_pmd_start = end; return; } /* * If the range does not contiguously follows previous one, make sure * to mark the unused range of the previous one so it can be removed. */ vmemmap_flush_unused_pmd(); __vmemmap_use_sub_pmd(start); } static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) { const unsigned long page = ALIGN_DOWN(start, PMD_SIZE); vmemmap_flush_unused_pmd(); /* * Could be our memmap page is filled with PAGE_UNUSED already from a * previous remove. Make sure to reset it. */ __vmemmap_use_sub_pmd(start); /* * Mark with PAGE_UNUSED the unused parts of the new memmap range */ if (!IS_ALIGNED(start, PMD_SIZE)) memset((void *)page, PAGE_UNUSED, start - page); /* * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of * consecutive sections. Remember for the last added PMD where the * unused range begins. */ if (!IS_ALIGNED(end, PMD_SIZE)) unused_pmd_start = end; } /* * Memory hotplug specific functions */ #ifdef CONFIG_MEMORY_HOTPLUG /* * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need * updating. */ static void update_end_of_memory_vars(u64 start, u64 size) { unsigned long end_pfn = PFN_UP(start + size); if (end_pfn > max_pfn) { max_pfn = end_pfn; max_low_pfn = end_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; } } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params) { unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END)) return -ERANGE; ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); /* * Special case: add_pages() is called by memremap_pages() for adding device * private pages. Do not bump up max_pfn in the device private path, * because max_pfn changes affect dma_addressing_limited(). * * dma_addressing_limited() returning true when max_pfn is the device's * addressable memory can force device drivers to use bounce buffers * and impact their performance negatively: */ if (!params->pgmap) /* update max_pfn, max_low_pfn and high_memory */ update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); return ret; } int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size, params->pgprot); return add_pages(nid, start_pfn, nr_pages, params); } static void free_reserved_pages(struct page *page, unsigned long nr_pages) { while (nr_pages--) free_reserved_page(page++); } static void __meminit free_pagetable(struct page *page, int order) { /* bootmem page has reserved flag */ if (PageReserved(page)) { unsigned long nr_pages = 1 << order; #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE enum bootmem_type type = bootmem_type(page); if (type == SECTION_INFO || type == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); } else { free_reserved_pages(page, nr_pages); } #else free_reserved_pages(page, nr_pages); #endif } else { free_pages((unsigned long)page_address(page), order); } } static void __meminit free_hugepage_table(struct page *page, struct vmem_altmap *altmap) { if (altmap) vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE); else free_pagetable(page, get_order(PMD_SIZE)); } static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) { pte_t *pte; int i; for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; if (!pte_none(*pte)) return; } /* free a pte table */ free_pagetable(pmd_page(*pmd), 0); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); } static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) { pmd_t *pmd; int i; for (i = 0; i < PTRS_PER_PMD; i++) { pmd = pmd_start + i; if (!pmd_none(*pmd)) return; } /* free a pmd table */ free_pagetable(pud_page(*pud), 0); spin_lock(&init_mm.page_table_lock); pud_clear(pud); spin_unlock(&init_mm.page_table_lock); } static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) { pud_t *pud; int i; for (i = 0; i < PTRS_PER_PUD; i++) { pud = pud_start + i; if (!pud_none(*pud)) return; } /* free a pud table */ free_pagetable(p4d_page(*p4d), 0); spin_lock(&init_mm.page_table_lock); p4d_clear(p4d); spin_unlock(&init_mm.page_table_lock); } static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, bool direct) { unsigned long next, pages = 0; pte_t *pte; phys_addr_t phys_addr; pte = pte_start + pte_index(addr); for (; addr < end; addr = next, pte++) { next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; if (!pte_present(*pte)) continue; /* * We mapped [0,1G) memory as identity mapping when * initializing, in arch/x86/kernel/head_64.S. These * pagetables cannot be removed. */ phys_addr = pte_val(*pte) + (addr & PAGE_MASK); if (phys_addr < (phys_addr_t)0x40000000) return; if (!direct) free_pagetable(pte_page(*pte), 0); spin_lock(&init_mm.page_table_lock); pte_clear(&init_mm, addr, pte); spin_unlock(&init_mm.page_table_lock); /* For non-direct mapping, pages means nothing. */ pages++; } /* Call free_pte_table() in remove_pmd_table(). */ flush_tlb_all(); if (direct) update_page_count(PG_LEVEL_4K, -pages); } static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, bool direct, struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte_base; pmd_t *pmd; pmd = pmd_start + pmd_index(addr); for (; addr < end; addr = next, pmd++) { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) continue; if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) free_hugepage_table(pmd_page(*pmd), altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); pages++; } else if (vmemmap_pmd_is_unused(addr, next)) { free_hugepage_table(pmd_page(*pmd), altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); } continue; } pte_base = (pte_t *)pmd_page_vaddr(*pmd); remove_pte_table(pte_base, addr, next, direct); free_pte_table(pte_base, pmd); } /* Call free_pmd_table() in remove_pud_table(). */ if (direct) update_page_count(PG_LEVEL_2M, -pages); } static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, struct vmem_altmap *altmap, bool direct) { unsigned long next, pages = 0; pmd_t *pmd_base; pud_t *pud; pud = pud_start + pud_index(addr); for (; addr < end; addr = next, pud++) { next = pud_addr_end(addr, end); if (!pud_present(*pud)) continue; if (pud_leaf(*pud) && IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { spin_lock(&init_mm.page_table_lock); pud_clear(pud); spin_unlock(&init_mm.page_table_lock); pages++; continue; } pmd_base = pmd_offset(pud, 0); remove_pmd_table(pmd_base, addr, next, direct, altmap); free_pmd_table(pmd_base, pud); } if (direct) update_page_count(PG_LEVEL_1G, -pages); } static void __meminit remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, struct vmem_altmap *altmap, bool direct) { unsigned long next, pages = 0; pud_t *pud_base; p4d_t *p4d; p4d = p4d_start + p4d_index(addr); for (; addr < end; addr = next, p4d++) { next = p4d_addr_end(addr, end); if (!p4d_present(*p4d)) continue; BUILD_BUG_ON(p4d_leaf(*p4d)); pud_base = pud_offset(p4d, 0); remove_pud_table(pud_base, addr, next, altmap, direct); /* * For 4-level page tables we do not want to free PUDs, but in the * 5-level case we should free them. This code will have to change * to adapt for boot-time switching between 4 and 5 level page tables. */ if (pgtable_l5_enabled()) free_pud_table(pud_base, p4d); } if (direct) update_page_count(PG_LEVEL_512G, -pages); } /* start and end are both virtual address. */ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct, struct vmem_altmap *altmap) { unsigned long next; unsigned long addr; pgd_t *pgd; p4d_t *p4d; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); if (!pgd_present(*pgd)) continue; p4d = p4d_offset(pgd, 0); remove_p4d_table(p4d, addr, next, altmap, direct); } flush_tlb_all(); } void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); remove_pagetable(start, end, false, altmap); } static void __meminit kernel_physical_mapping_remove(unsigned long start, unsigned long end) { start = (unsigned long)__va(start); end = (unsigned long)__va(end); remove_pagetable(start, end, true, NULL); } void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; __remove_pages(start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { #if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) register_page_bootmem_info_node(NODE_DATA(i)); #endif } /* * Pre-allocates page-table pages for the vmalloc area in the kernel page-table. * Only the level which needs to be synchronized between all page-tables is * allocated because the synchronization can be expensive. */ static void __init preallocate_vmalloc_pages(void) { unsigned long addr; const char *lvl; for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; lvl = "p4d"; p4d = p4d_alloc(&init_mm, pgd, addr); if (!p4d) goto failed; if (pgtable_l5_enabled()) continue; /* * The goal here is to allocate all possibly required * hardware page tables pointed to by the top hardware * level. * * On 4-level systems, the P4D layer is folded away and * the above code does no preallocation. Below, go down * to the pud _software_ level to ensure the second * hardware level is allocated on 4-level systems too. */ lvl = "pud"; pud = pud_alloc(&init_mm, p4d, addr); if (!pud) goto failed; } return; failed: /* * The pages have to be there now or they will be missing in * process page-tables later. */ panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl); } void __init arch_mm_preinit(void) { pci_iommu_alloc(); } void __init mem_init(void) { /* clear_bss() already clear the empty_zero_page */ after_bootmem = 1; x86_init.hyper.init_after_bootmem(); /* * Must be done after boot memory is put on freelist, because here we * might set fields in deferred struct pages that have not yet been * initialized, and memblock_free_all() initializes all the reserved * deferred pages for us. */ register_page_bootmem_info(); /* Register memory areas for /proc/kcore */ if (get_gate_vma(&init_mm)) kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER); preallocate_vmalloc_pages(); } int kernel_set_to_readonly; void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = PFN_ALIGN(__start_rodata); unsigned long end = (unsigned long)__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(_etext); unsigned long rodata_end = PFN_ALIGN(__end_rodata); unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); kernel_set_to_readonly = 1; /* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. * * We align all_end to PMD_SIZE because the existing mapping * is a full PMD. If we would align _brk_end to PAGE_SIZE we * split the PMD and the reminder between _brk_end and the end * of the PMD will remain mapped executable. * * Any PMD which was setup after the one which covers _brk_end * has been zapped already via cleanup_highmem(). */ all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); set_ftrace_ops_ro(); #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif free_kernel_image_pages("unused kernel image (text/rodata gap)", (void *)text_end, (void *)rodata_start); free_kernel_image_pages("unused kernel image (rodata/data gap)", (void *)rodata_end, (void *)_sdata); } /* * Block size is the minimum amount of memory which can be hotplugged or * hotremoved. It must be power of two and must be equal or larger than * MIN_MEMORY_BLOCK_SIZE. */ #define MAX_BLOCK_SIZE (2UL << 30) /* Amount of ram needed to start using large blocks */ #define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30) /* Adjustable memory block size */ static unsigned long set_memory_block_size; int __init set_memory_block_size_order(unsigned int order) { unsigned long size = 1UL << order; if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE) return -EINVAL; set_memory_block_size = size; return 0; } static unsigned long probe_memory_block_size(void) { unsigned long boot_mem_end = max_pfn << PAGE_SHIFT; unsigned long bz; /* If memory block size has been set, then use it */ bz = set_memory_block_size; if (bz) goto done; /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */ if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) { bz = MIN_MEMORY_BLOCK_SIZE; goto done; } /* * When hotplug alignment is not a concern, maximize blocksize * to minimize overhead. Otherwise, align to the lesser of advice * alignment and end of memory alignment. */ bz = memory_block_advised_max_size(); if (!bz) { bz = MAX_BLOCK_SIZE; if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) goto done; } else { bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE); } /* Find the largest allowed block size that aligns to memory end */ for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { if (IS_ALIGNED(boot_mem_end, bz)) break; } done: pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; } static unsigned long memory_block_size_probed; unsigned long memory_block_size_bytes(void) { if (!memory_block_size_probed) memory_block_size_probed = probe_memory_block_size(); return memory_block_size_probed; } /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ static long __meminitdata addr_start, addr_end; static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) { pte_t entry; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start) pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); addr_start = addr; node_start = node; p_start = p; } addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; if (!IS_ALIGNED(addr, PMD_SIZE) || !IS_ALIGNED(next, PMD_SIZE)) vmemmap_use_new_sub_pmd(addr, next); } int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next) { int large = pmd_leaf(*pmd); if (pmd_leaf(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); vmemmap_use_sub_pmd(addr, next); } return large; } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { int err; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); if (end - start < PAGES_PER_SECTION * sizeof(struct page)) err = vmemmap_populate_basepages(start, end, node, NULL); else if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", __func__); err = -ENOMEM; } else err = vmemmap_populate_basepages(start, end, node, NULL); if (!err) sync_global_pgds(start, end - 1); return err; } #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long nr_pages) { unsigned long addr = (unsigned long)start_page; unsigned long end = (unsigned long)(start_page + nr_pages); unsigned long next; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; unsigned int nr_pmd_pages; struct page *page; for (; addr < end; addr = next) { pte_t *pte = NULL; pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; } get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; } get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); pud = pud_offset(p4d, addr); if (pud_none(*pud)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; } if (!boot_cpu_has(X86_FEATURE_PSE) || !pmd_leaf(*pmd)) { next = (addr + PAGE_SIZE) & PAGE_MASK; get_page_bootmem(section_nr, pmd_page(*pmd), MIX_SECTION_INFO); pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) continue; get_page_bootmem(section_nr, pte_page(*pte), SECTION_INFO); } else { next = pmd_addr_end(addr, end); nr_pmd_pages = (next - addr) >> PAGE_SHIFT; page = pmd_page(*pmd); while (nr_pmd_pages--) get_page_bootmem(section_nr, page++, SECTION_INFO); } } } #endif void __meminit vmemmap_populate_print_last(void) { if (p_start) { pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); p_start = NULL; p_end = NULL; node_start = 0; } } |
| 27 27 27 27 27 27 27 27 27 27 27 27 27 1 27 26 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 2 2 25 27 27 26 27 27 29 27 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | // SPDX-License-Identifier: GPL-2.0-or-later /* Direct I/O support. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/sched/mm.h> #include <linux/task_io_accounting_ops.h> #include <linux/netfs.h> #include "internal.h" static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; size_t rsize; rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len); subreq->len = rsize; if (unlikely(rreq->io_streams[0].sreq_max_segs)) { size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize, rreq->io_streams[0].sreq_max_segs); if (limit < rsize) { subreq->len = limit; trace_netfs_sreq(subreq, netfs_sreq_trace_limited); } } trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); subreq->io_iter = rreq->buffer.iter; iov_iter_truncate(&subreq->io_iter, subreq->len); iov_iter_advance(&rreq->buffer.iter, subreq->len); } /* * Perform a read to a buffer from the server, slicing up the region to be read * according to the network rsize. */ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned long long start = rreq->start; ssize_t size = rreq->len; int ret = 0; do { struct netfs_io_subrequest *subreq; ssize_t slice; subreq = netfs_alloc_subrequest(rreq); if (!subreq) { ret = -ENOMEM; break; } subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start; subreq->len = size; __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { stream->front = subreq; if (!stream->active) { stream->collected_to = stream->front->start; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); } } trace_netfs_sreq(subreq, netfs_sreq_trace_added); spin_unlock(&rreq->lock); netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } } netfs_prepare_dio_read_iterator(subreq); slice = subreq->len; size -= slice; start += slice; rreq->submitted += slice; if (size <= 0) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); } rreq->netfs_ops->issue_read(subreq); if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) netfs_wait_for_paused_read(rreq); if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) break; cond_resched(); } while (size > 0); if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); netfs_wake_collector(rreq); } return ret; } /* * Perform a read to an application buffer, bypassing the pagecache and the * local disk cache. */ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) { ssize_t ret; _enter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); return -EIO; } // TODO: Use bounce buffer if requested inode_dio_begin(rreq->inode); ret = netfs_dispatch_unbuffered_reads(rreq); if (!rreq->submitted) { netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); inode_dio_end(rreq->inode); ret = 0; goto out; } if (sync) ret = netfs_wait_for_read(rreq); else ret = -EIOCBQUEUED; out: _leave(" = %zd", ret); return ret; } /** * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read * @iocb: The I/O control descriptor describing the read * @iter: The output buffer (also specifies read length) * * Perform an unbuffered I/O or direct I/O from the file in @iocb to the * output buffer. No use is made of the pagecache. * * The caller must hold any appropriate locks. */ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) { struct netfs_io_request *rreq; ssize_t ret; size_t orig_count = iov_iter_count(iter); bool sync = is_sync_kiocb(iocb); _enter(""); if (!orig_count) return 0; /* Don't update atime */ ret = kiocb_write_and_wait(iocb, orig_count); if (ret < 0) return ret; file_accessed(iocb->ki_filp); rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, iocb->ki_pos, orig_count, iocb->ki_flags & IOCB_DIRECT ? NETFS_DIO_READ : NETFS_UNBUFFERED_READ); if (IS_ERR(rreq)) return PTR_ERR(rreq); netfs_stat(&netfs_n_rh_dio_read); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read); /* If this is an async op, we have to keep track of the destination * buffer for ourselves as the caller's iterator will be trashed when * we return. * * In such a case, extract an iterator to represent as much of the the * output buffer as we can manage. Note that the extraction might not * be able to allocate a sufficiently large bvec array and may shorten * the request. */ if (user_backed_iter(iter)) { ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) goto out; rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); rreq->len = iov_iter_count(&rreq->buffer.iter); } else { rreq->buffer.iter = *iter; rreq->len = orig_count; rreq->direct_bv_unpin = false; iov_iter_advance(iter, orig_count); } // TODO: Set up bounce buffer if needed if (!sync) { rreq->iocb = iocb; __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags); } ret = netfs_unbuffered_read(rreq, sync); if (ret < 0) goto out; /* May be -EIOCBQUEUED */ if (sync) { // TODO: Copy from bounce buffer iocb->ki_pos += rreq->transferred; ret = rreq->transferred; } out: netfs_put_request(rreq, netfs_rreq_trace_put_return); if (ret > 0) orig_count -= ret; return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); /** * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read * @iocb: The I/O control descriptor describing the read * @iter: The output buffer (also specifies read length) * * Perform an unbuffered I/O or direct I/O from the file in @iocb to the * output buffer. No use is made of the pagecache. */ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (!iter->count) return 0; /* Don't update atime */ ret = netfs_start_io_direct(inode); if (ret == 0) { ret = netfs_unbuffered_read_iter_locked(iocb, iter); netfs_end_io_direct(inode); } return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter); |
| 3754 3760 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | // SPDX-License-Identifier: GPL-2.0 /* * x86 specific code for irq_work * * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> #include <linux/irq_work.h> #include <linux/hardirq.h> #include <asm/apic.h> #include <asm/idtentry.h> #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> #ifdef CONFIG_X86_LOCAL_APIC DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { apic_eoi(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); } void arch_irq_work_raise(void) { if (!arch_irq_work_has_interrupt()) return; __apic_send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); } #endif |
| 17 17 17 1 17 1 17 1 17 1 17 16 16 16 4 4 16 16 2 1 2 1 17 17 17 16 2 14 18 18 17 17 17 18 16 18 17 18 16 18 7 18 18 21 19 19 19 18 18 19 20 19 23 22 21 20 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/sock_diag.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/packet_diag.h> #include <linux/percpu.h> #include <net/net_namespace.h> #include <net/sock.h> #include "internal.h" static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) { struct packet_diag_info pinfo; pinfo.pdi_index = po->ifindex; pinfo.pdi_version = po->tp_version; pinfo.pdi_reserve = po->tp_reserve; pinfo.pdi_copy_thresh = READ_ONCE(po->copy_thresh); pinfo.pdi_tstamp = READ_ONCE(po->tp_tstamp); pinfo.pdi_flags = 0; if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) pinfo.pdi_flags |= PDI_RUNNING; if (packet_sock_flag(po, PACKET_SOCK_AUXDATA)) pinfo.pdi_flags |= PDI_AUXDATA; if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) pinfo.pdi_flags |= PDI_ORIGDEV; if (READ_ONCE(po->vnet_hdr_sz)) pinfo.pdi_flags |= PDI_VNETHDR; if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) pinfo.pdi_flags |= PDI_LOSS; return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo); } static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb) { struct nlattr *mca; struct packet_mclist *ml; mca = nla_nest_start_noflag(nlskb, PACKET_DIAG_MCLIST); if (!mca) return -EMSGSIZE; rtnl_lock(); for (ml = po->mclist; ml; ml = ml->next) { struct packet_diag_mclist *dml; dml = nla_reserve_nohdr(nlskb, sizeof(*dml)); if (!dml) { rtnl_unlock(); nla_nest_cancel(nlskb, mca); return -EMSGSIZE; } dml->pdmc_index = ml->ifindex; dml->pdmc_type = ml->type; dml->pdmc_alen = ml->alen; dml->pdmc_count = ml->count; BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr)); memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr)); } rtnl_unlock(); nla_nest_end(nlskb, mca); return 0; } static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, struct sk_buff *nlskb) { struct packet_diag_ring pdr; if (!ring->pg_vec) return 0; pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT; pdr.pdr_block_nr = ring->pg_vec_len; pdr.pdr_frame_size = ring->frame_size; pdr.pdr_frame_nr = ring->frame_max + 1; if (ver > TPACKET_V2) { pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; pdr.pdr_features = ring->prb_bdqc.feature_req_word; } else { pdr.pdr_retire_tmo = 0; pdr.pdr_sizeof_priv = 0; pdr.pdr_features = 0; } return nla_put(nlskb, nl_type, sizeof(pdr), &pdr); } static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb) { int ret; mutex_lock(&po->pg_vec_lock); ret = pdiag_put_ring(&po->rx_ring, po->tp_version, PACKET_DIAG_RX_RING, skb); if (!ret) ret = pdiag_put_ring(&po->tx_ring, po->tp_version, PACKET_DIAG_TX_RING, skb); mutex_unlock(&po->pg_vec_lock); return ret; } static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb) { int ret = 0; mutex_lock(&fanout_mutex); if (po->fanout) { u32 val; val = (u32)po->fanout->id | ((u32)po->fanout->type << 16); ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val); } mutex_unlock(&fanout_mutex); return ret; } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag_req *req, bool may_report_filterinfo, struct user_namespace *user_ns, u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct packet_diag_msg *rp; struct packet_sock *po = pkt_sk(sk); nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags); if (!nlh) return -EMSGSIZE; rp = nlmsg_data(nlh); rp->pdiag_family = AF_PACKET; rp->pdiag_type = sk->sk_type; rp->pdiag_num = ntohs(READ_ONCE(po->num)); rp->pdiag_ino = sk_ino; sock_diag_save_cookie(sk, rp->pdiag_cookie); if ((req->pdiag_show & PACKET_SHOW_INFO) && pdiag_put_info(po, skb)) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_INFO) && nla_put_u32(skb, PACKET_DIAG_UID, from_kuid_munged(user_ns, sk_uid(sk)))) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_MCLIST) && pdiag_put_mclist(po, skb)) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_RING_CFG) && pdiag_put_rings_cfg(po, skb)) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_FANOUT) && pdiag_put_fanout(po, skb)) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_MEMINFO) && sock_diag_put_meminfo(sk, skb, PACKET_DIAG_MEMINFO)) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_FILTER) && sock_diag_put_filterinfo(may_report_filterinfo, sk, skb, PACKET_DIAG_FILTER)) goto out_nlmsg_trim; nlmsg_end(skb, nlh); return 0; out_nlmsg_trim: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int num = 0, s_num = cb->args[0]; struct packet_diag_req *req; struct net *net; struct sock *sk; bool may_report_filterinfo; net = sock_net(skb->sk); req = nlmsg_data(cb->nlh); may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN); mutex_lock(&net->packet.sklist_lock); sk_for_each(sk, &net->packet.sklist) { if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (sk_diag_fill(sk, skb, req, may_report_filterinfo, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, sock_i_ino(sk)) < 0) goto done; next: num++; } done: mutex_unlock(&net->packet.sklist_lock); cb->args[0] = num; return skb->len; } static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct packet_diag_req); struct net *net = sock_net(skb->sk); struct packet_diag_req *req; if (nlmsg_len(h) < hdrlen) return -EINVAL; req = nlmsg_data(h); /* Make it possible to support protocol filtering later */ if (req->sdiag_protocol) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = packet_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else return -EOPNOTSUPP; } static const struct sock_diag_handler packet_diag_handler = { .owner = THIS_MODULE, .family = AF_PACKET, .dump = packet_diag_handler_dump, }; static int __init packet_diag_init(void) { return sock_diag_register(&packet_diag_handler); } static void __exit packet_diag_exit(void) { sock_diag_unregister(&packet_diag_handler); } module_init(packet_diag_init); module_exit(packet_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PACKET socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */); |
| 226 189 199 220 189 52 214 215 214 215 213 215 215 215 215 215 215 213 215 194 129 129 129 66 129 129 193 193 41 194 235 194 194 193 194 194 194 194 194 194 194 194 194 194 194 194 185 153 151 151 3 153 194 185 203 203 203 201 203 203 203 202 203 203 226 226 226 17 17 235 235 235 235 235 235 235 52 234 200 187 235 235 235 226 3 3 3 3 2 2 7 7 5 3 4 4 6 4 7 151 100 100 146 97 144 145 151 2 151 151 151 151 151 151 151 146 151 185 185 150 34 34 5 151 151 2 150 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 | /* SPDX-License-Identifier: GPL-2.0 * * page_pool.c * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ #include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> #include <linux/mm.h> /* for put_page() */ #include <linux/poison.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <trace/events/page_pool.h> #include "dev.h" #include "mp_dmabuf_devmem.h" #include "netmem_priv.h" #include "page_pool_priv.h" DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) #define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ #define recycle_stat_inc(pool, __stat) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_inc(s->__stat); \ } while (0) #define recycle_stat_add(pool, __stat, val) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_add(s->__stat, val); \ } while (0) static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_alloc_fast", "rx_pp_alloc_slow", "rx_pp_alloc_slow_ho", "rx_pp_alloc_empty", "rx_pp_alloc_refill", "rx_pp_alloc_waive", "rx_pp_recycle_cached", "rx_pp_recycle_cache_full", "rx_pp_recycle_ring", "rx_pp_recycle_ring_full", "rx_pp_recycle_released_ref", }; /** * page_pool_get_stats() - fetch page pool stats * @pool: pool from which page was allocated * @stats: struct page_pool_stats to fill in * * Retrieve statistics about the page_pool. This API is only available * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. * A pointer to a caller allocated struct page_pool_stats structure * is passed to this API which is filled in. The caller can then report * those stats to the user (perhaps via ethtool, debugfs, etc.). */ bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; if (!stats) return false; /* The caller is responsible to initialize stats. */ stats->alloc_stats.fast += pool->alloc_stats.fast; stats->alloc_stats.slow += pool->alloc_stats.slow; stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; stats->alloc_stats.empty += pool->alloc_stats.empty; stats->alloc_stats.refill += pool->alloc_stats.refill; stats->alloc_stats.waive += pool->alloc_stats.waive; for_each_possible_cpu(cpu) { const struct page_pool_recycle_stats *pcpu = per_cpu_ptr(pool->recycle_stats, cpu); stats->recycle_stats.cached += pcpu->cached; stats->recycle_stats.cache_full += pcpu->cache_full; stats->recycle_stats.ring += pcpu->ring; stats->recycle_stats.ring_full += pcpu->ring_full; stats->recycle_stats.released_refcnt += pcpu->released_refcnt; } return true; } EXPORT_SYMBOL(page_pool_get_stats); u8 *page_pool_ethtool_stats_get_strings(u8 *data) { int i; for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { memcpy(data, pp_stats[i], ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); int page_pool_ethtool_stats_get_count(void) { return ARRAY_SIZE(pp_stats); } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; *data++ = pool_stats->alloc_stats.slow_high_order; *data++ = pool_stats->alloc_stats.empty; *data++ = pool_stats->alloc_stats.refill; *data++ = pool_stats->alloc_stats.waive; *data++ = pool_stats->recycle_stats.cached; *data++ = pool_stats->recycle_stats.cache_full; *data++ = pool_stats->recycle_stats.ring; *data++ = pool_stats->recycle_stats.ring_full; *data++ = pool_stats->recycle_stats.released_refcnt; return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else #define alloc_stat_inc(...) do { } while (0) #define recycle_stat_inc(...) do { } while (0) #define recycle_stat_add(...) do { } while (0) #endif static bool page_pool_producer_lock(struct page_pool *pool) __acquires(&pool->ring.producer_lock) { bool in_softirq = in_softirq(); if (in_softirq) spin_lock(&pool->ring.producer_lock); else spin_lock_bh(&pool->ring.producer_lock); return in_softirq; } static void page_pool_producer_unlock(struct page_pool *pool, bool in_softirq) __releases(&pool->ring.producer_lock) { if (in_softirq) spin_unlock(&pool->ring.producer_lock); else spin_unlock_bh(&pool->ring.producer_lock); } static void page_pool_struct_check(void) { CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, PAGE_POOL_FRAG_GROUP_ALIGN); } static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params, int cpuid) { unsigned int ring_qsize = 1024; /* Default */ struct netdev_rx_queue *rxq; int err; page_pool_struct_check(); memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) ring_qsize = pool->p.pool_size; /* Sanity limit mem that can be pinned down */ if (ring_qsize > 32768) return -E2BIG; /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; pool->dma_map = true; } if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; pool->dma_sync = true; /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ } pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) return -ENOMEM; } else { /* For system page pool instance we use a singular stats object * instead of allocating a separate percpu variable for each * (also percpu) page pool instance. */ pool->recycle_stats = &pp_system_recycle_stats; pool->system = true; } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return -ENOMEM; } atomic_set(&pool->pages_state_release_cnt, 0); /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; pool->mp_ops = rxq->mp_params.mp_ops; } if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) { err = -EOPNOTSUPP; goto free_ptr_ring; } if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { err = -EFAULT; goto free_ptr_ring; } err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); goto free_ptr_ring; } static_branch_inc(&page_pool_mem_providers); } return 0; free_ptr_ring: ptr_ring_cleanup(&pool->ring, NULL); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return err; } static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif } /** * page_pool_create_percpu() - create a page pool for a given cpu. * @params: parameters, see struct page_pool_params * @cpuid: cpu identifier */ struct page_pool * page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); if (!pool) return ERR_PTR(-ENOMEM); err = page_pool_init(pool, params, cpuid); if (err < 0) goto err_free; err = page_pool_list(pool); if (err) goto err_uninit; return pool; err_uninit: page_pool_uninit(pool); err_free: pr_warn("%s() gave up with errno %d\n", __func__, err); kfree(pool); return ERR_PTR(err); } EXPORT_SYMBOL(page_pool_create_percpu); /** * page_pool_create() - create a page pool * @params: parameters, see struct page_pool_params */ struct page_pool *page_pool_create(const struct page_pool_params *params) { return page_pool_create_percpu(params, -1); } EXPORT_SYMBOL(page_pool_create); static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem); static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; netmem_ref netmem; int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) { alloc_stat_inc(pool, empty); return 0; } /* Softirq guarantee CPU and thus NUMA node is stable. This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. */ #ifdef CONFIG_NUMA pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; #else /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ #endif /* Refill alloc array, but only if NUMA match */ do { netmem = (__force netmem_ref)__ptr_ring_consume(r); if (unlikely(!netmem)) break; if (likely(netmem_is_pref_nid(netmem, pref_nid))) { pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; * (1) release 1 page to page-allocator and * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ page_pool_return_netmem(pool, netmem); alloc_stat_inc(pool, waive); netmem = 0; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, refill); } return netmem; } /* fast path */ static netmem_ref __page_pool_get_cached(struct page_pool *pool) { netmem_ref netmem; /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ if (likely(pool->alloc.count)) { /* Fast-path */ netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, fast); } else { netmem = page_pool_refill_alloc_cache(pool); } return netmem; } static void __page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); dma_sync_size = min(dma_sync_size, pool->p.max_len); __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, dma_sync_size, pool->p.dma_dir); #endif } static __always_inline void page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { rcu_read_lock(); /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ if (pool->dma_sync) __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); rcu_read_unlock(); } } static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; int err; u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. */ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, (PAGE_SIZE << pool->p.order), pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); if (dma_mapping_error(pool->p.dev, dma)) return false; if (page_pool_set_dma_addr_netmem(netmem, dma)) { WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; } if (in_softirq()) err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), PP_DMA_INDEX_LIMIT, gfp); else err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), PP_DMA_INDEX_LIMIT, gfp); if (err) { WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); goto unset_failed; } netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; unset_failed: page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); return false; } static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { struct page *page; gfp |= __GFP_COMP; page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); if (unlikely(!page)) return NULL; if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } alloc_stat_inc(pool, slow_high_order); page_pool_set_pp_info(pool, page_to_netmem(page)); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page_to_netmem(page), pool->pages_state_hold_cnt); return page; } /* slow path */ static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool, gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; unsigned int pp_order = pool->p.order; bool dma_map = pool->dma_map; netmem_ref netmem; int i, nr_pages; /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); /* Unnecessary as alloc cache is empty, but guarantees zero count */ if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; /* Pages have been filled into alloc.cache array, but count is zero and * page element have not been (possibly) DMA mapped. */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } page_pool_set_pp_info(pool, netmem); pool->alloc.cache[pool->alloc.count++] = netmem; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, slow); } else { netmem = 0; } /* When page just alloc'ed is should/must have refcnt 1. */ return netmem; } /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; /* Fast-path: Get a page from cache */ netmem = __page_pool_get_cached(pool); if (netmem) return netmem; /* Slow-path: cache empty, do real allocation */ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_netmems_slow(pool, gfp); return netmem; } EXPORT_SYMBOL(page_pool_alloc_netmems); ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution */ #define _distance(a, b) (s32)((a) - (b)) s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); s32 inflight; inflight = _distance(hold_cnt, release_cnt); if (strict) { trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); } else { inflight = max(0, inflight); } return inflight; } void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { netmem_set_pp(netmem, pool); netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it * is allocated from the page allocator and page_pool_fragment_page() * is dirtying the same cache line as the page->pp_magic above, so * the overhead is negligible. */ page_pool_fragment_netmem(netmem, 1); if (pool->has_init_callback) pool->slow.init_callback(netmem, pool->slow.init_arg); } void page_pool_clear_pp_info(netmem_ref netmem) { netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, netmem_ref netmem) { struct page *old, *page = netmem_to_page(netmem); unsigned long id; dma_addr_t dma; if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ return; id = netmem_get_dma_index(netmem); if (!id) return; if (in_softirq()) old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); else old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); if (old != page) return; dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need * to disconnect a page (from a page_pool), to allow it to be used as * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem) { int count; bool put; put = true; if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_netmem_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, netmem, count); if (put) { page_pool_clear_pp_info(netmem); put_page(netmem_to_page(netmem)); } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). */ } static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { bool in_softirq, ret; /* BH protection not needed if current is softirq */ in_softirq = page_pool_producer_lock(pool); ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); if (ret) recycle_stat_inc(pool, ring); page_pool_producer_unlock(pool, in_softirq); return ret; } /* Only allow direct recycling in special circumstances, into the * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. * * Caller must provide appropriate safe context. */ static bool page_pool_recycle_in_cache(netmem_ref netmem, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { recycle_stat_inc(pool, cache_full); return false; } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ pool->alloc.cache[pool->alloc.count++] = netmem; recycle_stat_inc(pool, cached); return true; } static bool __page_pool_page_can_be_recycled(netmem_ref netmem) { return netmem_is_net_iov(netmem) || (page_ref_count(netmem_to_page(netmem)) == 1 && !page_is_pfmemalloc(netmem_to_page(netmem))); } /* If the page refcnt == 1, this will try to recycle the page. * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ static __always_inline netmem_ref __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { lockdep_assert_no_hardirq(); /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. * * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ if (likely(__page_pool_page_can_be_recycled(netmem))) { /* Read barrier done in page_ref_count / READ_ONCE */ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) return 0; /* Page found as candidate for recycling */ return netmem; } /* Fallback/non-XDP mode: API user have elevated refcnt. * * Many drivers split up the page into fragments, and some * want to keep doing this to save memory and do refcnt based * recycling. Support this use case too, to ease drivers * switching between XDP/non-XDP. * * In-case page_pool maintains the DMA mapping, API user must * call page_pool_put_page once. In this elevated refcnt * case, the DMA is unmapped/released, as driver is likely * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); page_pool_return_netmem(pool, netmem); return 0; } static bool page_pool_napi_local(const struct page_pool *pool) { const struct napi_struct *napi; u32 cpuid; /* On PREEMPT_RT the softirq can be preempted by the consumer */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; if (unlikely(!in_softirq())) return false; /* Allow direct recycle if we have reasons to believe that we are * in the same context as the consumer would run, so there's * no possible race. * __page_pool_put_page() makes sure we're not in hardirq context * and interrupts are enabled prior to accessing the cache. */ cpuid = smp_processor_id(); if (READ_ONCE(pool->cpuid) == cpuid) return true; napi = READ_ONCE(pool->p.napi); return napi && READ_ONCE(napi->list_owner) == cpuid; } void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { if (!allow_direct) allow_direct = page_pool_napi_local(pool); netmem = __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_put_unrefed_netmem); void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } EXPORT_SYMBOL(page_pool_put_unrefed_page); static void page_pool_recycle_ring_bulk(struct page_pool *pool, netmem_ref *bulk, u32 bulk_len) { bool in_softirq; u32 i; /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); for (i = 0; i < bulk_len; i++) { if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } page_pool_producer_unlock(pool, in_softirq); recycle_stat_add(pool, ring, i); /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; /* * ptr_ring cache is full, free remaining pages outside producer lock * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) page_pool_return_netmem(pool, bulk[i]); } /** * page_pool_put_netmem_bulk() - release references on multiple netmems * @data: array holding netmem references * @count: number of entries in @data * * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() * will release leftover netmems to the memory provider. * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx * completion loop for the XDP_REDIRECT use case. * * Please note the caller must not use data area after running * page_pool_put_netmem_bulk(), as this function overwrites it. */ void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { u32 bulk_len = 0; for (u32 i = 0; i < count; i++) { netmem_ref netmem = netmem_compound_head(data[i]); if (page_pool_unref_and_test(netmem)) data[bulk_len++] = netmem; } count = bulk_len; while (count) { netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; struct page_pool *pool = NULL; bool allow_direct; u32 foreign = 0; bulk_len = 0; for (u32 i = 0; i < count; i++) { struct page_pool *netmem_pp; netmem_ref netmem = data[i]; netmem_pp = netmem_get_pp(netmem); if (unlikely(!pool)) { pool = netmem_pp; allow_direct = page_pool_napi_local(pool); } else if (netmem_pp != pool) { /* * If the netmem belongs to a different * page_pool, save it for another round. */ data[foreign++] = netmem; continue; } netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (netmem) bulk[bulk_len++] = netmem; } if (bulk_len) page_pool_recycle_ring_bulk(pool, bulk, bulk_len); count = foreign; } } EXPORT_SYMBOL(page_pool_put_netmem_bulk); static netmem_ref page_pool_drain_frag(struct page_pool *pool, netmem_ref netmem) { long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ if (likely(page_pool_unref_netmem(netmem, drain_count))) return 0; if (__page_pool_page_can_be_recycled(netmem)) { page_pool_dma_sync_for_device(pool, netmem, -1); return netmem; } page_pool_return_netmem(pool, netmem); return 0; } static void page_pool_free_frag(struct page_pool *pool) { long drain_count = BIAS_MAX - pool->frag_users; netmem_ref netmem = pool->frag_page; pool->frag_page = 0; if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; page_pool_return_netmem(pool, netmem); } netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem = pool->frag_page; if (WARN_ON(size > max_size)) return 0; size = ALIGN(size, dma_get_cache_alignment()); *offset = pool->frag_offset; if (netmem && *offset + size > max_size) { netmem = page_pool_drain_frag(pool, netmem); if (netmem) { recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } } if (!netmem) { netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; } pool->frag_page = netmem; frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; page_pool_fragment_netmem(netmem, BIAS_MAX); return netmem; } pool->frag_users++; pool->frag_offset = *offset + size; return netmem; } EXPORT_SYMBOL(page_pool_alloc_frag_netmem); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, gfp)); } EXPORT_SYMBOL(page_pool_alloc_frag); static void page_pool_empty_ring(struct page_pool *pool) { netmem_ref netmem; /* Empty recycle ring */ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ if (!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", __func__, netmem_ref_count(netmem)); page_pool_return_netmem(pool, netmem); } } static void __page_pool_destroy(struct page_pool *pool) { if (pool->disconnect) pool->disconnect(pool); page_pool_unlist(pool); page_pool_uninit(pool); if (pool->mp_ops) { pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } kfree(pool); } static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { netmem_ref netmem; if (pool->destroy_cnt) return; /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_netmem(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { unsigned long id; void *ptr; page_pool_empty_alloc_cache_once(pool); if (!pool->destroy_cnt++ && pool->dma_map) { if (pool->dma_sync) { /* Disable page_pool_dma_sync_for_device() */ pool->dma_sync = false; /* Make sure all concurrent returns that may see the old * value of dma_sync (and thus perform a sync) have * finished before doing the unmapping below. Skip the * wait if the device doesn't actually need syncing, or * if there are no outstanding mapped pages. */ if (dma_dev_need_sync(pool->p.dev) && !xa_empty(&pool->dma_mapped)) synchronize_net(); } xa_for_each(&pool->dma_mapped, id, ptr) __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr)); } /* No more consumers should exist, but producers could still * be in-flight. */ page_pool_empty_ring(pool); } static int page_pool_release(struct page_pool *pool) { bool in_softirq; int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); /* Acquire producer lock to make sure producers have exited. */ in_softirq = page_pool_producer_lock(pool); page_pool_producer_unlock(pool, in_softirq); if (!inflight) __page_pool_destroy(pool); return inflight; } static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); void *netdev; int inflight; inflight = page_pool_release(pool); /* In rare cases, a driver bug may cause inflight to go negative. * Don't reschedule release if inflight is 0 or negative. * - If 0, the page_pool has been destroyed * - if negative, we will never recover * in both cases no reschedule is necessary. */ if (inflight <= 0) return; /* Periodic warning for page pools the user can't see */ netdev = READ_ONCE(pool->slow.netdev); if (time_after_eq(jiffies, pool->defer_warn) && (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } /* Still not ready to be disconnected, retry later */ schedule_delayed_work(&pool->release_dw, DEFER_TIME); } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } /** * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI * @pool: page pool to modify * @napi: NAPI instance to associate the page pool with * * Associate a page pool with a NAPI instance for lockless page recycling. * This is useful when a new page pool has to be added to a NAPI instance * without disabling that NAPI instance, to mark the point at which control * path "hands over" the page pool to the NAPI instance. In most cases driver * can simply set the @napi field in struct page_pool_params, and does not * have to call this helper. * * The function is idempotent, but does not implement any refcounting. * Single page_pool_disable_direct_recycling() will disable recycling, * no matter how many times enable was called. */ void page_pool_enable_direct_recycling(struct page_pool *pool, struct napi_struct *napi) { if (READ_ONCE(pool->p.napi) == napi) return; WARN_ON(!napi || pool->p.napi); mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, napi); mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_enable_direct_recycling); void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); if (!pool->p.napi) return; napi_assert_will_not_race(pool->p.napi); mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); void page_pool_destroy(struct page_pool *pool) { if (!pool) return; if (!page_pool_put(pool)) return; page_pool_disable_direct_recycling(pool); page_pool_free_frag(pool); if (!page_pool_release(pool)) return; page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); schedule_delayed_work(&pool->release_dw, DEFER_TIME); } EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { netmem_ref netmem; trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) { return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); } /* Associate a niov with a page pool. Should follow with a matching * net_mp_niov_clear_page_pool() */ void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); page_pool_set_pp_info(pool, netmem); pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Disassociate a niov from a page pool. Should only be used in the * ->release_netmem() path. */ void net_mp_niov_clear_page_pool(struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); page_pool_clear_pp_info(netmem); } |
| 71 9 1 24 34 34 34 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Queue of folios definitions * * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See: * * Documentation/core-api/folio_queue.rst * * for a description of the API. */ #ifndef _LINUX_FOLIO_QUEUE_H #define _LINUX_FOLIO_QUEUE_H #include <linux/pagevec.h> #include <linux/mm.h> /* * Segment in a queue of running buffers. Each segment can hold a number of * folios and a portion of the queue can be referenced with the ITER_FOLIOQ * iterator. The possibility exists of inserting non-folio elements into the * queue (such as gaps). * * Explicit prev and next pointers are used instead of a list_head to make it * easier to add segments to tail and remove them from the head without the * need for a lock. */ struct folio_queue { struct folio_batch vec; /* Folios in the queue segment */ u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ struct folio_queue *next; /* Next queue segment or NULL */ struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif unsigned int rreq_id; unsigned int debug_id; }; /** * folioq_init - Initialise a folio queue segment * @folioq: The segment to initialise * @rreq_id: The request identifier to use in tracelines. * * Initialise a folio queue segment and set an identifier to be used in traces. * * Note that the folio pointers are left uninitialised. */ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) { folio_batch_init(&folioq->vec); folioq->next = NULL; folioq->prev = NULL; folioq->marks = 0; folioq->marks2 = 0; folioq->rreq_id = rreq_id; folioq->debug_id = 0; } /** * folioq_nr_slots: Query the capacity of a folio queue segment * @folioq: The segment to query * * Query the number of folios that a particular folio queue segment might hold. * [!] NOTE: This must not be assumed to be the same for every segment! */ static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) { return PAGEVEC_SIZE; } /** * folioq_count: Query the occupancy of a folio queue segment * @folioq: The segment to query * * Query the number of folios that have been added to a folio queue segment. * Note that this is not decreased as folios are removed from a segment. */ static inline unsigned int folioq_count(struct folio_queue *folioq) { return folio_batch_count(&folioq->vec); } /** * folioq_full: Query if a folio queue segment is full * @folioq: The segment to query * * Query if a folio queue segment is fully occupied. Note that this does not * change if folios are removed from a segment. */ static inline bool folioq_full(struct folio_queue *folioq) { //return !folio_batch_space(&folioq->vec); return folioq_count(folioq) >= folioq_nr_slots(folioq); } /** * folioq_is_marked: Check first folio mark in a folio queue segment * @folioq: The segment to query * @slot: The slot number of the folio to query * * Determine if the first mark is set for the folio in the specified slot in a * folio queue segment. */ static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) { return test_bit(slot, &folioq->marks); } /** * folioq_mark: Set the first mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Set the first mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) { set_bit(slot, &folioq->marks); } /** * folioq_unmark: Clear the first mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Clear the first mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) { clear_bit(slot, &folioq->marks); } /** * folioq_is_marked2: Check second folio mark in a folio queue segment * @folioq: The segment to query * @slot: The slot number of the folio to query * * Determine if the second mark is set for the folio in the specified slot in a * folio queue segment. */ static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) { return test_bit(slot, &folioq->marks2); } /** * folioq_mark2: Set the second mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Set the second mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) { set_bit(slot, &folioq->marks2); } /** * folioq_unmark2: Clear the second mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Clear the second mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) { clear_bit(slot, &folioq->marks2); } /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to * @folio: The folio to add * * Add a folio to the tail of the sequence in a folio queue segment, increasing * the occupancy count and returning the slot number for the folio just added. * The folio size is extracted and stored in the queue and the marks are left * unmodified. * * Note that it's left up to the caller to check that the segment capacity will * not be exceeded and to extend the queue. */ static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) { unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; folioq->orders[slot] = folio_order(folio); return slot; } /** * folioq_append_mark: Add a folio to a folio queue segment * @folioq: The segment to add to * @folio: The folio to add * * Add a folio to the tail of the sequence in a folio queue segment, increasing * the occupancy count and returning the slot number for the folio just added. * The folio size is extracted and stored in the queue, the first mark is set * and and the second and third marks are left unmodified. * * Note that it's left up to the caller to check that the segment capacity will * not be exceeded and to extend the queue. */ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) { unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; folioq->orders[slot] = folio_order(folio); folioq_mark(folioq, slot); return slot; } /** * folioq_folio: Get a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the folio in the specified slot from a folio queue segment. Note * that no bounds check is made and if the slot hasn't been added into yet, the * pointer will be undefined. If the slot has been cleared, NULL will be * returned. */ static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) { return folioq->vec.folios[slot]; } /** * folioq_folio_order: Get the order of a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the order of the folio in the specified slot from a folio queue * segment. Note that no bounds check is made and if the slot hasn't been * added into yet, the order returned will be 0. */ static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) { return folioq->orders[slot]; } /** * folioq_folio_size: Get the size of a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the size of the folio in the specified slot from a folio queue * segment. Note that no bounds check is made and if the slot hasn't been * added into yet, the size returned will be PAGE_SIZE. */ static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) { return PAGE_SIZE << folioq_folio_order(folioq, slot); } /** * folioq_clear: Clear a folio from a folio queue segment * @folioq: The segment to clear * @slot: The folio slot to clear * * Clear a folio from a sequence in a folio queue segment and clear its marks. * The occupancy count is left unchanged. */ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) { folioq->vec.folios[slot] = NULL; folioq_unmark(folioq, slot); folioq_unmark2(folioq, slot); } #endif /* _LINUX_FOLIO_QUEUE_H */ |
| 7 7 4 3 3 3 2 10 8 9 7 3 10 5 2 5 3 10 10 10 10 4 11 6 10 3 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/scatterlist.h> #include <linux/sched.h> #include <linux/slab.h> #include <crypto/aes.h> #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/sched/mm.h> #include <keys/ceph-type.h> #include <keys/user-type.h> #include <linux/ceph/decode.h> #include "crypto.h" /* * Set ->key and ->tfm. The rest of the key should be filled in before * this function is called. */ static int set_secret(struct ceph_crypto_key *key, void *buf) { unsigned int noio_flag; int ret; key->key = NULL; key->tfm = NULL; switch (key->type) { case CEPH_CRYPTO_NONE: return 0; /* nothing to do */ case CEPH_CRYPTO_AES: break; default: return -ENOTSUPP; } if (!key->len) return -EINVAL; key->key = kmemdup(buf, key->len, GFP_NOIO); if (!key->key) { ret = -ENOMEM; goto fail; } /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); memalloc_noio_restore(noio_flag); if (IS_ERR(key->tfm)) { ret = PTR_ERR(key->tfm); key->tfm = NULL; goto fail; } ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len); if (ret) goto fail; return 0; fail: ceph_crypto_key_destroy(key); return ret; } int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src) { memcpy(dst, src, sizeof(struct ceph_crypto_key)); return set_secret(dst, src->key); } int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) { int ret; ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); key->type = ceph_decode_16(p); ceph_decode_copy(p, &key->created, sizeof(key->created)); key->len = ceph_decode_16(p); ceph_decode_need(p, end, key->len, bad); ret = set_secret(key, *p); memzero_explicit(*p, key->len); *p += key->len; return ret; bad: dout("failed to decode crypto key\n"); return -EINVAL; } int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) { int inlen = strlen(inkey); int blen = inlen * 3 / 4; void *buf, *p; int ret; dout("crypto_key_unarmor %s\n", inkey); buf = kmalloc(blen, GFP_NOFS); if (!buf) return -ENOMEM; blen = ceph_unarmor(buf, inkey, inkey+inlen); if (blen < 0) { kfree(buf); return blen; } p = buf; ret = ceph_crypto_key_decode(key, &p, p + blen); kfree(buf); if (ret) return ret; dout("crypto_key_unarmor key %p type %d len %d\n", key, key->type, key->len); return 0; } void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { if (key) { kfree_sensitive(key->key); key->key = NULL; if (key->tfm) { crypto_free_sync_skcipher(key->tfm); key->tfm = NULL; } } } static const u8 *aes_iv = (u8 *)CEPH_AES_IV; /* * Should be used for buffers allocated with kvmalloc(). * Currently these are encrypt out-buffer (ceph_buffer) and decrypt * in-buffer (msg front). * * Dispose of @sgt with teardown_sgtable(). * * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() * in cases where a single sg is sufficient. No attempt to reduce the * number of sgs by squeezing physically contiguous pages together is * made though, for simplicity. */ static int setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, const void *buf, unsigned int buf_len) { struct scatterlist *sg; const bool is_vmalloc = is_vmalloc_addr(buf); unsigned int off = offset_in_page(buf); unsigned int chunk_cnt = 1; unsigned int chunk_len = PAGE_ALIGN(off + buf_len); int i; int ret; if (buf_len == 0) { memset(sgt, 0, sizeof(*sgt)); return -EINVAL; } if (is_vmalloc) { chunk_cnt = chunk_len >> PAGE_SHIFT; chunk_len = PAGE_SIZE; } if (chunk_cnt > 1) { ret = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); if (ret) return ret; } else { WARN_ON(chunk_cnt != 1); sg_init_table(prealloc_sg, 1); sgt->sgl = prealloc_sg; sgt->nents = sgt->orig_nents = 1; } for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { struct page *page; unsigned int len = min(chunk_len - off, buf_len); if (is_vmalloc) page = vmalloc_to_page(buf); else page = virt_to_page(buf); sg_set_page(sg, page, len, off); off = 0; buf += len; buf_len -= len; } WARN_ON(buf_len != 0); return 0; } static void teardown_sgtable(struct sg_table *sgt) { if (sgt->orig_nents > 1) sg_free_table(sgt); } static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm); struct sg_table sgt; struct scatterlist prealloc_sg; char iv[AES_BLOCK_SIZE] __aligned(8); int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1)); int crypt_len = encrypt ? in_len + pad_byte : in_len; int ret; WARN_ON(crypt_len > buf_len); if (encrypt) memset(buf + in_len, pad_byte, pad_byte); ret = setup_sgtable(&sgt, &prealloc_sg, buf, crypt_len); if (ret) return ret; memcpy(iv, aes_iv, AES_BLOCK_SIZE); skcipher_request_set_sync_tfm(req, key->tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv); /* print_hex_dump(KERN_ERR, "key: ", DUMP_PREFIX_NONE, 16, 1, key->key, key->len, 1); print_hex_dump(KERN_ERR, " in: ", DUMP_PREFIX_NONE, 16, 1, buf, crypt_len, 1); */ if (encrypt) ret = crypto_skcipher_encrypt(req); else ret = crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (ret) { pr_err("%s %scrypt failed: %d\n", __func__, encrypt ? "en" : "de", ret); goto out_sgt; } /* print_hex_dump(KERN_ERR, "out: ", DUMP_PREFIX_NONE, 16, 1, buf, crypt_len, 1); */ if (encrypt) { *pout_len = crypt_len; } else { pad_byte = *(char *)(buf + in_len - 1); if (pad_byte > 0 && pad_byte <= AES_BLOCK_SIZE && in_len >= pad_byte) { *pout_len = in_len - pad_byte; } else { pr_err("%s got bad padding %d on in_len %d\n", __func__, pad_byte, in_len); ret = -EPERM; goto out_sgt; } } out_sgt: teardown_sgtable(&sgt); return ret; } int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { switch (key->type) { case CEPH_CRYPTO_NONE: *pout_len = in_len; return 0; case CEPH_CRYPTO_AES: return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len, pout_len); default: return -ENOTSUPP; } } static int ceph_key_preparse(struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; size_t datalen = prep->datalen; int ret; void *p; ret = -EINVAL; if (datalen <= 0 || datalen > 32767 || !prep->data) goto err; ret = -ENOMEM; ckey = kmalloc(sizeof(*ckey), GFP_KERNEL); if (!ckey) goto err; /* TODO ceph_crypto_key_decode should really take const input */ p = (void *)prep->data; ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); if (ret < 0) goto err_ckey; prep->payload.data[0] = ckey; prep->quotalen = datalen; return 0; err_ckey: kfree(ckey); err: return ret; } static void ceph_key_free_preparse(struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey = prep->payload.data[0]; ceph_crypto_key_destroy(ckey); kfree(ckey); } static void ceph_key_destroy(struct key *key) { struct ceph_crypto_key *ckey = key->payload.data[0]; ceph_crypto_key_destroy(ckey); kfree(ckey); } struct key_type key_type_ceph = { .name = "ceph", .preparse = ceph_key_preparse, .free_preparse = ceph_key_free_preparse, .instantiate = generic_key_instantiate, .destroy = ceph_key_destroy, }; int __init ceph_crypto_init(void) { return register_key_type(&key_type_ceph); } void ceph_crypto_shutdown(void) { unregister_key_type(&key_type_ceph); } |
| 1 1 1 1 1 1 2 2 1 3 3 2 1 1 1 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 | // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the VoIP USB phones with CM109 chipsets. * * Copyright (C) 2007 - 2008 Alfred E. Heggestad <aeh@db.org> */ /* * Tested devices: * - Komunikate KIP1000 * - Genius G-talk * - Allied-Telesis Corega USBPH01 * - ... * * This driver is based on the yealink.c driver * * Thanks to: * - Authors of yealink.c * - Thomas Reitmayr * - Oliver Neukum for good review comments and code * - Shaun Jackman <sjackman@gmail.com> for Genius G-talk keymap * - Dmitry Torokhov for valuable input and review * * Todo: * - Read/write EEPROM */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/rwsem.h> #include <linux/usb/input.h> #define DRIVER_VERSION "20080805" #define DRIVER_AUTHOR "Alfred E. Heggestad" #define DRIVER_DESC "CM109 phone driver" static char *phone = "kip1000"; module_param(phone, charp, S_IRUSR); MODULE_PARM_DESC(phone, "Phone name {kip1000, gtalk, usbph01, atcom}"); enum { /* HID Registers */ HID_IR0 = 0x00, /* Record/Playback-mute button, Volume up/down */ HID_IR1 = 0x01, /* GPI, generic registers or EEPROM_DATA0 */ HID_IR2 = 0x02, /* Generic registers or EEPROM_DATA1 */ HID_IR3 = 0x03, /* Generic registers or EEPROM_CTRL */ HID_OR0 = 0x00, /* Mapping control, buzzer, SPDIF (offset 0x04) */ HID_OR1 = 0x01, /* GPO - General Purpose Output */ HID_OR2 = 0x02, /* Set GPIO to input/output mode */ HID_OR3 = 0x03, /* SPDIF status channel or EEPROM_CTRL */ /* HID_IR0 */ RECORD_MUTE = 1 << 3, PLAYBACK_MUTE = 1 << 2, VOLUME_DOWN = 1 << 1, VOLUME_UP = 1 << 0, /* HID_OR0 */ /* bits 7-6 0: HID_OR1-2 are used for GPO; HID_OR0, 3 are used for buzzer and SPDIF 1: HID_OR0-3 are used as generic HID registers 2: Values written to HID_OR0-3 are also mapped to MCU_CTRL, EEPROM_DATA0-1, EEPROM_CTRL (see Note) 3: Reserved */ HID_OR_GPO_BUZ_SPDIF = 0 << 6, HID_OR_GENERIC_HID_REG = 1 << 6, HID_OR_MAP_MCU_EEPROM = 2 << 6, BUZZER_ON = 1 << 5, /* up to 256 normal keys, up to 15 special key combinations */ KEYMAP_SIZE = 256 + 15, }; /* CM109 protocol packet */ struct cm109_ctl_packet { u8 byte[4]; } __attribute__ ((packed)); enum { USB_PKT_LEN = sizeof(struct cm109_ctl_packet) }; /* CM109 device structure */ struct cm109_dev { struct input_dev *idev; /* input device */ struct usb_device *udev; /* usb device */ struct usb_interface *intf; /* irq input channel */ struct cm109_ctl_packet *irq_data; dma_addr_t irq_dma; struct urb *urb_irq; /* control output channel */ struct cm109_ctl_packet *ctl_data; dma_addr_t ctl_dma; struct usb_ctrlrequest *ctl_req; struct urb *urb_ctl; /* * The 3 bitfields below are protected by ctl_submit_lock. * They have to be separate since they are accessed from IRQ * context. */ unsigned irq_urb_pending:1; /* irq_urb is in flight */ unsigned ctl_urb_pending:1; /* ctl_urb is in flight */ unsigned buzzer_pending:1; /* need to issue buzz command */ spinlock_t ctl_submit_lock; unsigned char buzzer_state; /* on/off */ /* flags */ unsigned open:1; unsigned resetting:1; unsigned shutdown:1; /* This mutex protects writes to the above flags */ struct mutex pm_mutex; unsigned short keymap[KEYMAP_SIZE]; char phys[64]; /* physical device path */ int key_code; /* last reported key */ int keybit; /* 0=new scan 1,2,4,8=scan columns */ u8 gpi; /* Cached value of GPI (high nibble) */ }; /****************************************************************************** * CM109 key interface *****************************************************************************/ static unsigned short special_keymap(int code) { if (code > 0xff) { switch (code - 0xff) { case RECORD_MUTE: return KEY_MICMUTE; case PLAYBACK_MUTE: return KEY_MUTE; case VOLUME_DOWN: return KEY_VOLUMEDOWN; case VOLUME_UP: return KEY_VOLUMEUP; } } return KEY_RESERVED; } /* Map device buttons to internal key events. * * The "up" and "down" keys, are symbolised by arrows on the button. * The "pickup" and "hangup" keys are symbolised by a green and red phone * on the button. Komunikate KIP1000 Keyboard Matrix -> -- 1 -- 2 -- 3 --> GPI pin 4 (0x10) | | | | <- -- 4 -- 5 -- 6 --> GPI pin 5 (0x20) | | | | END - 7 -- 8 -- 9 --> GPI pin 6 (0x40) | | | | OK -- * -- 0 -- # --> GPI pin 7 (0x80) | | | | /|\ /|\ /|\ /|\ | | | | GPO pin: 3 2 1 0 0x8 0x4 0x2 0x1 */ static unsigned short keymap_kip1000(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x14: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x11: return KEY_NUMERIC_3; /* 3 */ case 0x24: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x21: return KEY_NUMERIC_6; /* 6 */ case 0x44: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x41: return KEY_NUMERIC_9; /* 9 */ case 0x81: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x88: return KEY_ENTER; /* pickup */ case 0x48: return KEY_ESC; /* hangup */ case 0x28: return KEY_LEFT; /* IN */ case 0x18: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* Contributed by Shaun Jackman <sjackman@gmail.com> Genius G-Talk keyboard matrix 0 1 2 3 4: 0 4 8 Talk 5: 1 5 9 End 6: 2 6 # Up 7: 3 7 * Down */ static unsigned short keymap_gtalk(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; case 0x21: return KEY_NUMERIC_1; case 0x41: return KEY_NUMERIC_2; case 0x81: return KEY_NUMERIC_3; case 0x12: return KEY_NUMERIC_4; case 0x22: return KEY_NUMERIC_5; case 0x42: return KEY_NUMERIC_6; case 0x82: return KEY_NUMERIC_7; case 0x14: return KEY_NUMERIC_8; case 0x24: return KEY_NUMERIC_9; case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* Talk (green handset) */ case 0x28: return KEY_ESC; /* End (red handset) */ case 0x48: return KEY_UP; /* Menu up (rocker switch) */ case 0x88: return KEY_DOWN; /* Menu down (rocker switch) */ default: return special_keymap(scancode); } } /* * Keymap for Allied-Telesis Corega USBPH01 * http://www.alliedtelesis-corega.com/2/1344/1437/1360/chprd.html * * Contributed by july@nat.bg */ static unsigned short keymap_usbph01(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; /* 0 */ case 0x21: return KEY_NUMERIC_1; /* 1 */ case 0x41: return KEY_NUMERIC_2; /* 2 */ case 0x81: return KEY_NUMERIC_3; /* 3 */ case 0x12: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x42: return KEY_NUMERIC_6; /* 6 */ case 0x82: return KEY_NUMERIC_7; /* 7 */ case 0x14: return KEY_NUMERIC_8; /* 8 */ case 0x24: return KEY_NUMERIC_9; /* 9 */ case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* IN */ case 0x88: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* * Keymap for ATCom AU-100 * http://www.atcom.cn/products.html * http://www.packetizer.com/products/au100/ * http://www.voip-info.org/wiki/view/AU-100 * * Contributed by daniel@gimpelevich.san-francisco.ca.us */ static unsigned short keymap_atcom(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x11: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x14: return KEY_NUMERIC_3; /* 3 */ case 0x21: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x24: return KEY_NUMERIC_6; /* 6 */ case 0x41: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x44: return KEY_NUMERIC_9; /* 9 */ case 0x84: return KEY_NUMERIC_POUND; /* # */ case 0x81: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* left arrow */ case 0x88: return KEY_RIGHT; /* right arrow */ default: return special_keymap(scancode); } } static unsigned short (*keymap)(int) = keymap_kip1000; /* * Completes a request by converting the data into events for the * input subsystem. */ static void report_key(struct cm109_dev *dev, int key) { struct input_dev *idev = dev->idev; if (dev->key_code >= 0) { /* old key up */ input_report_key(idev, dev->key_code, 0); } dev->key_code = key; if (key >= 0) { /* new valid key */ input_report_key(idev, key, 1); } input_sync(idev); } /* * Converts data of special key presses (volume, mute) into events * for the input subsystem, sends press-n-release for mute keys. */ static void cm109_report_special(struct cm109_dev *dev) { static const u8 autorelease = RECORD_MUTE | PLAYBACK_MUTE; struct input_dev *idev = dev->idev; u8 data = dev->irq_data->byte[HID_IR0]; unsigned short keycode; int i; for (i = 0; i < 4; i++) { keycode = dev->keymap[0xff + BIT(i)]; if (keycode == KEY_RESERVED) continue; input_report_key(idev, keycode, data & BIT(i)); if (data & autorelease & BIT(i)) { input_sync(idev); input_report_key(idev, keycode, 0); } } input_sync(idev); } /****************************************************************************** * CM109 usb communication interface *****************************************************************************/ static void cm109_submit_buzz_toggle(struct cm109_dev *dev) { int error; if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } static void cm109_submit_ctl(struct cm109_dev *dev) { int error; guard(spinlock_irqsave)(&dev->ctl_submit_lock); dev->irq_urb_pending = 0; if (unlikely(dev->shutdown)) return; if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } /* * IRQ handler */ static void cm109_urb_irq_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; dev_dbg(&dev->intf->dev, "### URB IRQ: [0x%02x 0x%02x 0x%02x 0x%02x] keybit=0x%02x\n", dev->irq_data->byte[0], dev->irq_data->byte[1], dev->irq_data->byte[2], dev->irq_data->byte[3], dev->keybit); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); goto out; } /* Special keys */ cm109_report_special(dev); /* Scan key column */ if (dev->keybit == 0xf) { /* Any changes ? */ if ((dev->gpi & 0xf0) == (dev->irq_data->byte[HID_IR1] & 0xf0)) goto out; dev->gpi = dev->irq_data->byte[HID_IR1] & 0xf0; dev->keybit = 0x1; } else { report_key(dev, dev->keymap[dev->irq_data->byte[HID_IR1]]); dev->keybit <<= 1; if (dev->keybit > 0x8) dev->keybit = 0xf; } out: cm109_submit_ctl(dev); } static void cm109_urb_ctl_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; int error; dev_dbg(&dev->intf->dev, "### URB CTL: [0x%02x 0x%02x 0x%02x 0x%02x]\n", dev->ctl_data->byte[0], dev->ctl_data->byte[1], dev->ctl_data->byte[2], dev->ctl_data->byte[3]); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); } guard(spinlock_irqsave)(&dev->ctl_submit_lock); dev->ctl_urb_pending = 0; if (unlikely(dev->shutdown)) return; if (dev->buzzer_pending || status) { dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } else if (likely(!dev->irq_urb_pending)) { /* ask for key data */ dev->irq_urb_pending = 1; error = usb_submit_urb(dev->urb_irq, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_irq) failed %d\n", __func__, error); } } static void cm109_toggle_buzzer_async(struct cm109_dev *dev) { guard(spinlock_irqsave)(&dev->ctl_submit_lock); if (dev->ctl_urb_pending) { /* URB completion will resubmit */ dev->buzzer_pending = 1; } else { dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } } static void cm109_toggle_buzzer_sync(struct cm109_dev *dev, int on) { int error; if (on) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), dev->ctl_req->bRequest, dev->ctl_req->bRequestType, le16_to_cpu(dev->ctl_req->wValue), le16_to_cpu(dev->ctl_req->wIndex), dev->ctl_data, USB_PKT_LEN, USB_CTRL_SET_TIMEOUT); if (error < 0 && error != -EINTR) dev_err(&dev->intf->dev, "%s: usb_control_msg() failed %d\n", __func__, error); } static void cm109_stop_traffic(struct cm109_dev *dev) { dev->shutdown = 1; /* * Make sure other CPUs see this */ smp_wmb(); usb_kill_urb(dev->urb_ctl); usb_kill_urb(dev->urb_irq); cm109_toggle_buzzer_sync(dev, 0); dev->shutdown = 0; smp_wmb(); } static void cm109_restore_state(struct cm109_dev *dev) { if (dev->open) { /* * Restore buzzer state. * This will also kick regular URB submission */ cm109_toggle_buzzer_async(dev); } } /****************************************************************************** * input event interface *****************************************************************************/ static int cm109_input_open(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); int error; error = usb_autopm_get_interface(dev->intf); if (error < 0) { dev_err(&idev->dev, "%s - cannot autoresume, result %d\n", __func__, error); return error; } scoped_guard(mutex, &dev->pm_mutex) { dev->buzzer_state = 0; dev->key_code = -1; /* no keys pressed */ dev->keybit = 0xf; /* issue INIT */ dev->ctl_data->byte[HID_OR0] = HID_OR_GPO_BUZ_SPDIF; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->ctl_data->byte[HID_OR3] = 0x00; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_KERNEL); if (!error) { dev->open = 1; return 0; } } dev->ctl_urb_pending = 0; usb_autopm_put_interface(dev->intf); dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); return error; } static void cm109_input_close(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); scoped_guard(mutex, &dev->pm_mutex) { /* * Once we are here event delivery is stopped so we * don't need to worry about someone starting buzzer * again */ cm109_stop_traffic(dev); dev->open = 0; } usb_autopm_put_interface(dev->intf); } static int cm109_input_ev(struct input_dev *idev, unsigned int type, unsigned int code, int value) { struct cm109_dev *dev = input_get_drvdata(idev); dev_dbg(&dev->intf->dev, "input_ev: type=%u code=%u value=%d\n", type, code, value); if (type != EV_SND) return -EINVAL; switch (code) { case SND_TONE: case SND_BELL: dev->buzzer_state = !!value; if (!dev->resetting) cm109_toggle_buzzer_async(dev); return 0; default: return -EINVAL; } } /****************************************************************************** * Linux interface and usb initialisation *****************************************************************************/ struct driver_info { char *name; }; static const struct driver_info info_cm109 = { .name = "CM109 USB driver", }; enum { VENDOR_ID = 0x0d8c, /* C-Media Electronics */ PRODUCT_ID_CM109 = 0x000e, /* CM109 defines range 0x0008 - 0x000f */ }; /* table of devices that work with this driver */ static const struct usb_device_id cm109_usb_table[] = { { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = VENDOR_ID, .idProduct = PRODUCT_ID_CM109, .bInterfaceClass = USB_CLASS_HID, .bInterfaceSubClass = 0, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t) &info_cm109 }, /* you can add more devices here with product ID 0x0008 - 0x000f */ { } }; static void cm109_usb_cleanup(struct cm109_dev *dev) { kfree(dev->ctl_req); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->ctl_data, dev->ctl_dma); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->irq_data, dev->irq_dma); usb_free_urb(dev->urb_irq); /* parameter validation in core/urb */ usb_free_urb(dev->urb_ctl); /* parameter validation in core/urb */ kfree(dev); } static void cm109_usb_disconnect(struct usb_interface *interface) { struct cm109_dev *dev = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); input_unregister_device(dev->idev); cm109_usb_cleanup(dev); } static int cm109_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct driver_info *nfo = (struct driver_info *)id->driver_info; struct usb_host_interface *interface; struct usb_endpoint_descriptor *endpoint; struct cm109_dev *dev; struct input_dev *input_dev = NULL; int ret, pipe, i; int error = -ENOMEM; interface = intf->cur_altsetting; if (interface->desc.bNumEndpoints < 1) return -ENODEV; endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -ENODEV; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; spin_lock_init(&dev->ctl_submit_lock); mutex_init(&dev->pm_mutex); dev->udev = udev; dev->intf = intf; dev->idev = input_dev = input_allocate_device(); if (!input_dev) goto err_out; /* allocate usb buffers */ dev->irq_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->irq_dma); if (!dev->irq_data) goto err_out; dev->ctl_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->ctl_dma); if (!dev->ctl_data) goto err_out; dev->ctl_req = kmalloc(sizeof(*(dev->ctl_req)), GFP_KERNEL); if (!dev->ctl_req) goto err_out; /* allocate urb structures */ dev->urb_irq = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_irq) goto err_out; dev->urb_ctl = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_ctl) goto err_out; /* get a handle to the interrupt data pipe */ pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress); ret = usb_maxpacket(udev, pipe); if (ret != USB_PKT_LEN) dev_err(&intf->dev, "invalid payload size %d, expected %d\n", ret, USB_PKT_LEN); /* initialise irq urb */ usb_fill_int_urb(dev->urb_irq, udev, pipe, dev->irq_data, USB_PKT_LEN, cm109_urb_irq_callback, dev, endpoint->bInterval); dev->urb_irq->transfer_dma = dev->irq_dma; dev->urb_irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_irq->dev = udev; /* initialise ctl urb */ dev->ctl_req->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT; dev->ctl_req->bRequest = USB_REQ_SET_CONFIGURATION; dev->ctl_req->wValue = cpu_to_le16(0x200); dev->ctl_req->wIndex = cpu_to_le16(interface->desc.bInterfaceNumber); dev->ctl_req->wLength = cpu_to_le16(USB_PKT_LEN); usb_fill_control_urb(dev->urb_ctl, udev, usb_sndctrlpipe(udev, 0), (void *)dev->ctl_req, dev->ctl_data, USB_PKT_LEN, cm109_urb_ctl_callback, dev); dev->urb_ctl->transfer_dma = dev->ctl_dma; dev->urb_ctl->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_ctl->dev = udev; /* find out the physical bus location */ usb_make_path(udev, dev->phys, sizeof(dev->phys)); strlcat(dev->phys, "/input0", sizeof(dev->phys)); /* register settings for the input device */ input_dev->name = nfo->name; input_dev->phys = dev->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, dev); input_dev->open = cm109_input_open; input_dev->close = cm109_input_close; input_dev->event = cm109_input_ev; input_dev->keycode = dev->keymap; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(dev->keymap); input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_SND); input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); /* register available key events */ for (i = 0; i < KEYMAP_SIZE; i++) { unsigned short k = keymap(i); dev->keymap[i] = k; __set_bit(k, input_dev->keybit); } __clear_bit(KEY_RESERVED, input_dev->keybit); error = input_register_device(dev->idev); if (error) goto err_out; usb_set_intfdata(intf, dev); return 0; err_out: input_free_device(input_dev); cm109_usb_cleanup(dev); return error; } static int cm109_usb_suspend(struct usb_interface *intf, pm_message_t message) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_suspend (event=%d)\n", message.event); guard(mutex)(&dev->pm_mutex); cm109_stop_traffic(dev); return 0; } static int cm109_usb_resume(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_resume\n"); guard(mutex)(&dev->pm_mutex); cm109_restore_state(dev); return 0; } static int cm109_usb_pre_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); mutex_lock(&dev->pm_mutex); /* * Make sure input events don't try to toggle buzzer * while we are resetting */ dev->resetting = 1; smp_wmb(); cm109_stop_traffic(dev); return 0; } static int cm109_usb_post_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev->resetting = 0; smp_wmb(); cm109_restore_state(dev); mutex_unlock(&dev->pm_mutex); return 0; } static struct usb_driver cm109_driver = { .name = "cm109", .probe = cm109_usb_probe, .disconnect = cm109_usb_disconnect, .suspend = cm109_usb_suspend, .resume = cm109_usb_resume, .reset_resume = cm109_usb_resume, .pre_reset = cm109_usb_pre_reset, .post_reset = cm109_usb_post_reset, .id_table = cm109_usb_table, .supports_autosuspend = 1, }; static int __init cm109_select_keymap(void) { /* Load the phone keymap */ if (!strcasecmp(phone, "kip1000")) { keymap = keymap_kip1000; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Komunikate KIP1000 phone loaded\n"); } else if (!strcasecmp(phone, "gtalk")) { keymap = keymap_gtalk; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Genius G-talk phone loaded\n"); } else if (!strcasecmp(phone, "usbph01")) { keymap = keymap_usbph01; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Allied-Telesis Corega USBPH01 phone loaded\n"); } else if (!strcasecmp(phone, "atcom")) { keymap = keymap_atcom; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for ATCom AU-100 phone loaded\n"); } else { printk(KERN_ERR KBUILD_MODNAME ": " "Unsupported phone: %s\n", phone); return -EINVAL; } return 0; } static int __init cm109_init(void) { int err; err = cm109_select_keymap(); if (err) return err; err = usb_register(&cm109_driver); if (err) return err; printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_DESC ": " DRIVER_VERSION " (C) " DRIVER_AUTHOR "\n"); return 0; } static void __exit cm109_exit(void) { usb_deregister(&cm109_driver); } module_init(cm109_init); module_exit(cm109_exit); MODULE_DEVICE_TABLE(usb, cm109_usb_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); |
| 1293 1295 1293 121 127 125 126 126 126 126 126 126 126 126 130 146 37 37 36 130 37 35 130 34 34 1 34 126 126 126 126 126 126 126 126 31 31 30 30 29 30 30 30 17 30 9 30 30 31 30 126 126 126 125 126 126 126 126 126 8 152 151 110 51 151 149 137 136 136 18 18 17 4 4 4 124 124 121 121 121 121 121 121 120 117 121 121 1 120 12 121 148 129 130 34 130 34 33 1 34 34 16 34 146 13 4 13 13 124 124 1 124 124 124 124 124 124 1 123 123 122 10 10 10 10 10 3 10 10 10 122 121 115 112 115 115 114 19 97 124 124 124 124 5 124 19 5 5 5 5 1 1 1 4 124 124 124 124 124 117 117 1 117 118 157 9 150 1 149 25 130 130 126 126 126 125 124 114 114 25 124 121 139 147 88 88 74 87 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/rseq.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> /* For vma exec functions. */ #include "../mm/internal.h" static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { /* If it's an anonymous inode make sure that we catch any shenanigans. */ VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) && !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)); return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * ahead of time. */ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) return NULL; /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); #ifndef CONFIG_MMU bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); #else err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); if (err) goto err; #endif return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static inline int bprm_set_stack_limit(struct linux_binprm *bprm, unsigned long limit) { #ifdef CONFIG_MMU /* Avoid a pathological bprm->p. */ if (bprm->p < limit) return -E2BIG; bprm->argmin = bprm->p - limit; #endif return 0; } static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) { #ifdef CONFIG_MMU return bprm->p < bprm->argmin; #else return false; #endif } /* * Calculate bprm->argmin from: * - _STK_LIM * - ARG_MAX * - bprm->rlim_stack.rlim_cur * - bprm->argc * - bprm->envc * - bprm->p */ static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* Reject totally pathological counts. */ if (bprm->argc < 0 || bprm->envc < 0) return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) return -E2BIG; if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; return bprm_set_stack_limit(bprm, limit); } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) goto out; while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; vm_flags_t vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. */ ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ /* * On success, caller must call do_close_execat() on the returned * struct file to close it. */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { int err; struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file; if (path_noexec(&file->f_path)) return ERR_PTR(-EACCES); /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode))) return ERR_PTR(-EACCES); err = exe_file_deny_write_access(file); if (err) return ERR_PTR(err); return no_free_ptr(file); } /** * open_exec - Open a path name for execution * * @name: path name to open with the intent of executing it. * * Returns ERR_PTR on failure or allocated struct file on success. * * As this is a wrapper for the internal do_open_execat(), callers * must call exe_file_allow_write_access() before fput() on release. Also see * do_close_execat(). */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm, tsk); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } /* * This is unlocked -- the string will always be NUL-terminated, but * may show overlapping contents if racing concurrent reads. */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { size_t len = min(strlen(buf), sizeof(tsk->comm) - 1); trace_task_rename(tsk, buf); memcpy(tsk->comm, buf, len); memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * This tracepoint marks the point before flushing the old exec where * the current task is still unchanged, but errors are fatal (point of * no return). The later "sched_process_exec" tracepoint is called after * the current task has successfully switched to the new exec. */ trace_sched_prepare_exec(current, bprm); /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; /* see the comment in check_unsafe_exec() */ current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); /* * If the original filename was empty, alloc_bprm() made up a path * that will probably not be useful to admins running ps or similar. * Let's fix it up to be something reasonable. */ if (bprm->comm_from_dentry) { /* * Hold RCU lock to keep the name from being freed behind our back. * Use acquire semantics to make sure the terminating NUL from * __d_alloc() is seen. * * Note, we're deliberately sloppy here. We don't need to care about * detecting a concurrent rename and just want a terminated name. */ rcu_read_lock(); __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), true); rcu_read_unlock(); } else { __set_task_comm(me, kbasename(bprm->filename), true); } /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(¤t->signal->cred_guard_mutex); return -ENOMEM; } /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { if (!file) return; exe_file_allow_write_access(file); fput(file); } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { /* in case exec fails before de_thread() succeeds */ current->fs->in_exec = 0; mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM; file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) return ERR_CAST(file); bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { do_close_execat(file); return ERR_PTR(-ENOMEM); } bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); bprm->comm_from_dentry = 1; } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); } if (!bprm->fdpath) goto out_free; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; /* * At this point, security_file_open() has already been called (with * __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will * stop just after the security_bprm_creds_for_exec() call in * bprm_execve(). Indeed, the kernel should not try to parse the * content of the file with exec_binprm() nor change the calling * thread, which means that the following security functions will not * be called: * - security_bprm_check() * - security_bprm_creds_from_file() * - security_bprm_committing_creds() * - security_bprm_committed_creds() */ bprm->is_check = !!(flags & AT_EXECVE_CHECK); retval = bprm_mm_init(bprm); if (!retval) return bprm; out_free: free_bprm(bprm); return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... * * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) * from another sub-thread until de_thread() succeeds, this * state is protected by cred_guard_mutex we hold. */ n_fs = 1; read_seqlock_excl(&p->fs->seq); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; read_sequnlock_excl(&p->fs->seq); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; int err; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); /* Did the exec bit vanish out from under us? Give up. */ if (err) return; /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; return 0; } EXPORT_SYMBOL(remove_arg_zero); /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); return -ENOEXEC; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; exe_file_allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } static int bprm_execve(struct linux_binprm *bprm) { int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); sched_exec(); /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval || bprm->is_check) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); rseq_execve(current); /* execve succeeded */ current->in_execve = 0; user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); rseq_set_notify_resume(current); current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); } retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) validate_coredump_safety(); return error; } static const struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_EXEC_KUNIT_TEST #include "tests/exec_kunit.c" #endif |
| 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 10 10 5 10 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Clock domain and sample rate management functions */ #include <linux/bitops.h> #include <linux/init.h> #include <linux/string.h> #include <linux/usb.h> #include <linux/usb/audio.h> #include <linux/usb/audio-v2.h> #include <linux/usb/audio-v3.h> #include <sound/core.h> #include <sound/info.h> #include <sound/pcm.h> #include "usbaudio.h" #include "card.h" #include "helper.h" #include "clock.h" #include "quirks.h" union uac23_clock_source_desc { struct uac_clock_source_descriptor v2; struct uac3_clock_source_descriptor v3; }; union uac23_clock_selector_desc { struct uac_clock_selector_descriptor v2; struct uac3_clock_selector_descriptor v3; }; union uac23_clock_multiplier_desc { struct uac_clock_multiplier_descriptor v2; struct uac_clock_multiplier_descriptor v3; }; /* check whether the descriptor bLength has the minimal length */ #define DESC_LENGTH_CHECK(p, proto) \ ((proto) == UAC_VERSION_3 ? \ ((p)->v3.bLength >= sizeof((p)->v3)) : \ ((p)->v2.bLength >= sizeof((p)->v2))) #define GET_VAL(p, proto, field) \ ((proto) == UAC_VERSION_3 ? (p)->v3.field : (p)->v2.field) static void *find_uac_clock_desc(struct usb_host_interface *iface, int id, bool (*validator)(void *, int, int), u8 type, int proto) { void *cs = NULL; while ((cs = snd_usb_find_csint_desc(iface->extra, iface->extralen, cs, type))) { if (validator(cs, id, proto)) return cs; } return NULL; } static bool validate_clock_source(void *p, int id, int proto) { union uac23_clock_source_desc *cs = p; if (!DESC_LENGTH_CHECK(cs, proto)) return false; return GET_VAL(cs, proto, bClockID) == id; } static bool validate_clock_selector(void *p, int id, int proto) { union uac23_clock_selector_desc *cs = p; if (!DESC_LENGTH_CHECK(cs, proto)) return false; if (GET_VAL(cs, proto, bClockID) != id) return false; /* additional length check for baCSourceID array (in bNrInPins size) * and two more fields (which sizes depend on the protocol) */ if (proto == UAC_VERSION_3) return cs->v3.bLength >= sizeof(cs->v3) + cs->v3.bNrInPins + 4 /* bmControls */ + 2 /* wCSelectorDescrStr */; else return cs->v2.bLength >= sizeof(cs->v2) + cs->v2.bNrInPins + 1 /* bmControls */ + 1 /* iClockSelector */; } static bool validate_clock_multiplier(void *p, int id, int proto) { union uac23_clock_multiplier_desc *cs = p; if (!DESC_LENGTH_CHECK(cs, proto)) return false; return GET_VAL(cs, proto, bClockID) == id; } #define DEFINE_FIND_HELPER(name, obj, validator, type2, type3) \ static obj *name(struct snd_usb_audio *chip, int id, \ const struct audioformat *fmt) \ { \ struct usb_host_interface *ctrl_intf = \ snd_usb_find_ctrl_interface(chip, fmt->iface); \ return find_uac_clock_desc(ctrl_intf, id, validator, \ fmt->protocol == UAC_VERSION_3 ? (type3) : (type2), \ fmt->protocol); \ } DEFINE_FIND_HELPER(snd_usb_find_clock_source, union uac23_clock_source_desc, validate_clock_source, UAC2_CLOCK_SOURCE, UAC3_CLOCK_SOURCE); DEFINE_FIND_HELPER(snd_usb_find_clock_selector, union uac23_clock_selector_desc, validate_clock_selector, UAC2_CLOCK_SELECTOR, UAC3_CLOCK_SELECTOR); DEFINE_FIND_HELPER(snd_usb_find_clock_multiplier, union uac23_clock_multiplier_desc, validate_clock_multiplier, UAC2_CLOCK_MULTIPLIER, UAC3_CLOCK_MULTIPLIER); static int uac_clock_selector_get_val(struct snd_usb_audio *chip, int selector_id, int iface_no) { struct usb_host_interface *ctrl_intf; unsigned char buf; int ret; ctrl_intf = snd_usb_find_ctrl_interface(chip, iface_no); ret = snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0), UAC2_CS_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_IN, UAC2_CX_CLOCK_SELECTOR << 8, snd_usb_ctrl_intf(ctrl_intf) | (selector_id << 8), &buf, sizeof(buf)); if (ret < 0) return ret; return buf; } static int uac_clock_selector_set_val(struct snd_usb_audio *chip, int selector_id, unsigned char pin, int iface_no) { struct usb_host_interface *ctrl_intf; int ret; ctrl_intf = snd_usb_find_ctrl_interface(chip, iface_no); ret = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC2_CS_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT, UAC2_CX_CLOCK_SELECTOR << 8, snd_usb_ctrl_intf(ctrl_intf) | (selector_id << 8), &pin, sizeof(pin)); if (ret < 0) return ret; if (ret != sizeof(pin)) { usb_audio_err(chip, "setting selector (id %d) unexpected length %d\n", selector_id, ret); return -EINVAL; } ret = uac_clock_selector_get_val(chip, selector_id, iface_no); if (ret < 0) return ret; if (ret != pin) { usb_audio_err(chip, "setting selector (id %d) to %x failed (current: %d)\n", selector_id, pin, ret); return -EINVAL; } return ret; } static bool uac_clock_source_is_valid_quirk(struct snd_usb_audio *chip, const struct audioformat *fmt, int source_id) { bool ret = false; int count; unsigned char data; struct usb_device *dev = chip->dev; union uac23_clock_source_desc *cs_desc; struct usb_host_interface *ctrl_intf; ctrl_intf = snd_usb_find_ctrl_interface(chip, fmt->iface); cs_desc = snd_usb_find_clock_source(chip, source_id, fmt); if (!cs_desc) return false; if (fmt->protocol == UAC_VERSION_2) { /* * Assume the clock is valid if clock source supports only one * single sample rate, the terminal is connected directly to it * (there is no clock selector) and clock type is internal. * This is to deal with some Denon DJ controllers that always * reports that clock is invalid. */ if (fmt->nr_rates == 1 && (fmt->clock & 0xff) == cs_desc->v2.bClockID && (cs_desc->v2.bmAttributes & 0x3) != UAC_CLOCK_SOURCE_TYPE_EXT) return true; } /* * MOTU MicroBook IIc * Sample rate changes takes more than 2 seconds for this device. Clock * validity request returns false during that period. */ if (chip->usb_id == USB_ID(0x07fd, 0x0004)) { count = 0; while ((!ret) && (count < 50)) { int err; msleep(100); err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC2_CS_CUR, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, UAC2_CS_CONTROL_CLOCK_VALID << 8, snd_usb_ctrl_intf(ctrl_intf) | (source_id << 8), &data, sizeof(data)); if (err < 0) { dev_warn(&dev->dev, "%s(): cannot get clock validity for id %d\n", __func__, source_id); return false; } ret = !!data; count++; } } return ret; } static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, const struct audioformat *fmt, int source_id) { int err; unsigned char data; struct usb_device *dev = chip->dev; u32 bmControls; union uac23_clock_source_desc *cs_desc; struct usb_host_interface *ctrl_intf; ctrl_intf = snd_usb_find_ctrl_interface(chip, fmt->iface); cs_desc = snd_usb_find_clock_source(chip, source_id, fmt); if (!cs_desc) return false; if (fmt->protocol == UAC_VERSION_3) bmControls = le32_to_cpu(cs_desc->v3.bmControls); else bmControls = cs_desc->v2.bmControls; /* If a clock source can't tell us whether it's valid, we assume it is */ if (!uac_v2v3_control_is_readable(bmControls, UAC2_CS_CONTROL_CLOCK_VALID)) return true; err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC2_CS_CUR, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, UAC2_CS_CONTROL_CLOCK_VALID << 8, snd_usb_ctrl_intf(ctrl_intf) | (source_id << 8), &data, sizeof(data)); if (err < 0) { dev_warn(&dev->dev, "%s(): cannot get clock validity for id %d\n", __func__, source_id); return false; } if (data) return true; else return uac_clock_source_is_valid_quirk(chip, fmt, source_id); } static int __uac_clock_find_source(struct snd_usb_audio *chip, const struct audioformat *fmt, int entity_id, unsigned long *visited, bool validate) { union uac23_clock_source_desc *source; union uac23_clock_selector_desc *selector; union uac23_clock_multiplier_desc *multiplier; int ret, i, cur, err, pins, clock_id; const u8 *sources; int proto = fmt->protocol; bool readable, writeable; u32 bmControls; entity_id &= 0xff; if (test_and_set_bit(entity_id, visited)) { usb_audio_warn(chip, "%s(): recursive clock topology detected, id %d.\n", __func__, entity_id); return -EINVAL; } /* first, see if the ID we're looking at is a clock source already */ source = snd_usb_find_clock_source(chip, entity_id, fmt); if (source) { entity_id = GET_VAL(source, proto, bClockID); if (validate && !uac_clock_source_is_valid(chip, fmt, entity_id)) { usb_audio_err(chip, "clock source %d is not valid, cannot use\n", entity_id); return -ENXIO; } return entity_id; } selector = snd_usb_find_clock_selector(chip, entity_id, fmt); if (selector) { pins = GET_VAL(selector, proto, bNrInPins); clock_id = GET_VAL(selector, proto, bClockID); sources = GET_VAL(selector, proto, baCSourceID); cur = 0; if (proto == UAC_VERSION_3) bmControls = le32_to_cpu(*(__le32 *)(&selector->v3.baCSourceID[0] + pins)); else bmControls = *(__u8 *)(&selector->v2.baCSourceID[0] + pins); readable = uac_v2v3_control_is_readable(bmControls, UAC2_CX_CLOCK_SELECTOR); writeable = uac_v2v3_control_is_writeable(bmControls, UAC2_CX_CLOCK_SELECTOR); if (pins == 1) { ret = 1; goto find_source; } /* for now just warn about buggy device */ if (!readable) usb_audio_warn(chip, "%s(): clock selector control is not readable, id %d\n", __func__, clock_id); /* the entity ID we are looking at is a selector. * find out what it currently selects */ ret = uac_clock_selector_get_val(chip, clock_id, fmt->iface); if (ret < 0) { if (!chip->autoclock) return ret; goto find_others; } /* Selector values are one-based */ if (ret > pins || ret < 1) { usb_audio_err(chip, "%s(): selector reported illegal value, id %d, ret %d\n", __func__, clock_id, ret); if (!chip->autoclock) return -EINVAL; goto find_others; } find_source: cur = ret; ret = __uac_clock_find_source(chip, fmt, sources[ret - 1], visited, validate); if (ret > 0) { /* Skip setting clock selector again for some devices */ if (chip->quirk_flags & QUIRK_FLAG_SKIP_CLOCK_SELECTOR || !writeable) return ret; err = uac_clock_selector_set_val(chip, entity_id, cur, fmt->iface); if (err < 0) { if (pins == 1) { usb_audio_dbg(chip, "%s(): selector returned an error, " "assuming a firmware bug, id %d, ret %d\n", __func__, clock_id, err); return ret; } return err; } } if (!validate || ret > 0 || !chip->autoclock) return ret; find_others: if (!writeable) return -ENXIO; /* The current clock source is invalid, try others. */ for (i = 1; i <= pins; i++) { if (i == cur) continue; ret = __uac_clock_find_source(chip, fmt, sources[i - 1], visited, true); if (ret < 0) continue; err = uac_clock_selector_set_val(chip, entity_id, i, fmt->iface); if (err < 0) continue; usb_audio_info(chip, "found and selected valid clock source %d\n", ret); return ret; } return -ENXIO; } /* FIXME: multipliers only act as pass-thru element for now */ multiplier = snd_usb_find_clock_multiplier(chip, entity_id, fmt); if (multiplier) return __uac_clock_find_source(chip, fmt, GET_VAL(multiplier, proto, bCSourceID), visited, validate); return -EINVAL; } /* * For all kinds of sample rate settings and other device queries, * the clock source (end-leaf) must be used. However, clock selectors, * clock multipliers and sample rate converters may be specified as * clock source input to terminal. This functions walks the clock path * to its end and tries to find the source. * * The 'visited' bitfield is used internally to detect recursive loops. * * Returns the clock source UnitID (>=0) on success, or an error. */ int snd_usb_clock_find_source(struct snd_usb_audio *chip, const struct audioformat *fmt, bool validate) { DECLARE_BITMAP(visited, 256); memset(visited, 0, sizeof(visited)); switch (fmt->protocol) { case UAC_VERSION_2: case UAC_VERSION_3: return __uac_clock_find_source(chip, fmt, fmt->clock, visited, validate); default: return -EINVAL; } } static int set_sample_rate_v1(struct snd_usb_audio *chip, const struct audioformat *fmt, int rate) { struct usb_device *dev = chip->dev; unsigned char data[3]; int err, crate; /* if endpoint doesn't have sampling rate control, bail out */ if (!(fmt->attributes & UAC_EP_CS_ATTR_SAMPLE_RATE)) return 0; data[0] = rate; data[1] = rate >> 8; data[2] = rate >> 16; err = snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), UAC_SET_CUR, USB_TYPE_CLASS | USB_RECIP_ENDPOINT | USB_DIR_OUT, UAC_EP_CS_ATTR_SAMPLE_RATE << 8, fmt->endpoint, data, sizeof(data)); if (err < 0) { dev_err(&dev->dev, "%d:%d: cannot set freq %d to ep %#x\n", fmt->iface, fmt->altsetting, rate, fmt->endpoint); return err; } /* Don't check the sample rate for devices which we know don't * support reading */ if (chip->quirk_flags & QUIRK_FLAG_GET_SAMPLE_RATE) return 0; /* the firmware is likely buggy, don't repeat to fail too many times */ if (chip->sample_rate_read_error > 2) return 0; err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC_GET_CUR, USB_TYPE_CLASS | USB_RECIP_ENDPOINT | USB_DIR_IN, UAC_EP_CS_ATTR_SAMPLE_RATE << 8, fmt->endpoint, data, sizeof(data)); if (err < 0) { dev_err(&dev->dev, "%d:%d: cannot get freq at ep %#x\n", fmt->iface, fmt->altsetting, fmt->endpoint); chip->sample_rate_read_error++; return 0; /* some devices don't support reading */ } crate = data[0] | (data[1] << 8) | (data[2] << 16); if (!crate) { dev_info(&dev->dev, "failed to read current rate; disabling the check\n"); chip->sample_rate_read_error = 3; /* three strikes, see above */ return 0; } if (crate != rate) { dev_warn(&dev->dev, "current rate %d is different from the runtime rate %d\n", crate, rate); // runtime->rate = crate; } return 0; } static int get_sample_rate_v2v3(struct snd_usb_audio *chip, int iface, int altsetting, int clock) { struct usb_device *dev = chip->dev; __le32 data; int err; struct usb_host_interface *ctrl_intf; ctrl_intf = snd_usb_find_ctrl_interface(chip, iface); err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC2_CS_CUR, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, UAC2_CS_CONTROL_SAM_FREQ << 8, snd_usb_ctrl_intf(ctrl_intf) | (clock << 8), &data, sizeof(data)); if (err < 0) { dev_warn(&dev->dev, "%d:%d: cannot get freq (v2/v3): err %d\n", iface, altsetting, err); return 0; } return le32_to_cpu(data); } /* * Try to set the given sample rate: * * Return 0 if the clock source is read-only, the actual rate on success, * or a negative error code. * * This function gets called from format.c to validate each sample rate, too. * Hence no message is shown upon error */ int snd_usb_set_sample_rate_v2v3(struct snd_usb_audio *chip, const struct audioformat *fmt, int clock, int rate) { bool writeable; u32 bmControls; __le32 data; int err; union uac23_clock_source_desc *cs_desc; struct usb_host_interface *ctrl_intf; ctrl_intf = snd_usb_find_ctrl_interface(chip, fmt->iface); cs_desc = snd_usb_find_clock_source(chip, clock, fmt); if (!cs_desc) return 0; if (fmt->protocol == UAC_VERSION_3) bmControls = le32_to_cpu(cs_desc->v3.bmControls); else bmControls = cs_desc->v2.bmControls; writeable = uac_v2v3_control_is_writeable(bmControls, UAC2_CS_CONTROL_SAM_FREQ); if (!writeable) return 0; data = cpu_to_le32(rate); err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC2_CS_CUR, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT, UAC2_CS_CONTROL_SAM_FREQ << 8, snd_usb_ctrl_intf(ctrl_intf) | (clock << 8), &data, sizeof(data)); if (err < 0) return err; return get_sample_rate_v2v3(chip, fmt->iface, fmt->altsetting, clock); } static int set_sample_rate_v2v3(struct snd_usb_audio *chip, const struct audioformat *fmt, int rate) { int cur_rate, prev_rate; int clock; /* First, try to find a valid clock. This may trigger * automatic clock selection if the current clock is not * valid. */ clock = snd_usb_clock_find_source(chip, fmt, true); if (clock < 0) { /* We did not find a valid clock, but that might be * because the current sample rate does not match an * external clock source. Try again without validation * and we will do another validation after setting the * rate. */ clock = snd_usb_clock_find_source(chip, fmt, false); /* Hardcoded sample rates */ if (chip->quirk_flags & QUIRK_FLAG_IGNORE_CLOCK_SOURCE) return 0; if (clock < 0) return clock; } prev_rate = get_sample_rate_v2v3(chip, fmt->iface, fmt->altsetting, clock); if (prev_rate == rate) goto validation; cur_rate = snd_usb_set_sample_rate_v2v3(chip, fmt, clock, rate); if (cur_rate < 0) { usb_audio_err(chip, "%d:%d: cannot set freq %d (v2/v3): err %d\n", fmt->iface, fmt->altsetting, rate, cur_rate); return cur_rate; } if (!cur_rate) cur_rate = prev_rate; if (cur_rate != rate) { usb_audio_dbg(chip, "%d:%d: freq mismatch: req %d, clock runs @%d\n", fmt->iface, fmt->altsetting, rate, cur_rate); /* continue processing */ } /* FIXME - TEAC devices require the immediate interface setup */ if (USB_ID_VENDOR(chip->usb_id) == 0x0644) { bool cur_base_48k = (rate % 48000 == 0); bool prev_base_48k = (prev_rate % 48000 == 0); if (cur_base_48k != prev_base_48k) { usb_set_interface(chip->dev, fmt->iface, fmt->altsetting); if (chip->quirk_flags & QUIRK_FLAG_IFACE_DELAY) msleep(50); } } validation: /* validate clock after rate change */ if (!uac_clock_source_is_valid(chip, fmt, clock)) return -ENXIO; return 0; } int snd_usb_init_sample_rate(struct snd_usb_audio *chip, const struct audioformat *fmt, int rate) { usb_audio_dbg(chip, "%d:%d Set sample rate %d, clock %d\n", fmt->iface, fmt->altsetting, rate, fmt->clock); switch (fmt->protocol) { case UAC_VERSION_1: default: return set_sample_rate_v1(chip, fmt, rate); case UAC_VERSION_3: if (chip->badd_profile >= UAC3_FUNCTION_SUBCLASS_GENERIC_IO) { if (rate != UAC3_BADD_SAMPLING_RATE) return -ENXIO; else return 0; } fallthrough; case UAC_VERSION_2: return set_sample_rate_v2v3(chip, fmt, rate); } } |
| 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 | /* SPDX-License-Identifier: GPL-2.0 */ /* * ethtool.h: Defines for Linux ethtool. * * Copyright (C) 1998 David S. Miller (davem@redhat.com) * Copyright 2001 Jeff Garzik <jgarzik@pobox.com> * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) * Portions Copyright 2002 Intel (eli.kupermann@intel.com, * christopher.leech@intel.com, * scott.feldman@intel.com) * Portions Copyright (C) Sun Microsystems 2008 */ #ifndef _LINUX_ETHTOOL_H #define _LINUX_ETHTOOL_H #include <linux/bitmap.h> #include <linux/compat.h> #include <linux/if_ether.h> #include <linux/netlink.h> #include <linux/timer_types.h> #include <uapi/linux/ethtool.h> #include <uapi/linux/ethtool_netlink_generated.h> #include <uapi/linux/net_tstamp.h> #define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 #define ETHTOOL_MM_MAX_VERIFY_RETRIES 3 struct compat_ethtool_rx_flow_spec { u32 flow_type; union ethtool_flow_union h_u; struct ethtool_flow_ext h_ext; union ethtool_flow_union m_u; struct ethtool_flow_ext m_ext; compat_u64 ring_cookie; u32 location; }; struct compat_ethtool_rxnfc { u32 cmd; u32 flow_type; compat_u64 data; struct compat_ethtool_rx_flow_spec fs; u32 rule_cnt; u32 rule_locs[]; }; #include <linux/rculist.h> /** * enum ethtool_phys_id_state - indicator state for physical identification * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated * @ETHTOOL_ID_ACTIVE: Physical ID indicator should be activated * @ETHTOOL_ID_ON: LED should be turned on (used iff %ETHTOOL_ID_ACTIVE * is not supported) * @ETHTOOL_ID_OFF: LED should be turned off (used iff %ETHTOOL_ID_ACTIVE * is not supported) */ enum ethtool_phys_id_state { ETHTOOL_ID_INACTIVE, ETHTOOL_ID_ACTIVE, ETHTOOL_ID_ON, ETHTOOL_ID_OFF }; enum { ETH_RSS_HASH_TOP_BIT, /* Configurable RSS hash function - Toeplitz */ ETH_RSS_HASH_XOR_BIT, /* Configurable RSS hash function - Xor */ ETH_RSS_HASH_CRC32_BIT, /* Configurable RSS hash function - Crc32 */ /* * Add your fresh new hash function bits above and remember to update * rss_hash_func_strings[] in ethtool.c */ ETH_RSS_HASH_FUNCS_COUNT }; /** * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. * @tcp_data_split: Scatter packet headers and data to separate buffers * @tx_push: The flag of tx push mode * @rx_push: The flag of rx push mode * @cqe_size: Size of TX/RX completion queue event * @tx_push_buf_len: Size of TX push buffer * @tx_push_buf_max_len: Maximum allowed size of TX push buffer * @hds_thresh: Packet size threshold for header data split (HDS) * @hds_thresh_max: Maximum supported setting for @hds_threshold * */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; u8 tx_push; u8 rx_push; u32 cqe_size; u32 tx_push_buf_len; u32 tx_push_buf_max_len; u32 hds_thresh; u32 hds_thresh_max; }; /** * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split * @ETHTOOL_RING_USE_HDS_THRS: capture for setting header-data-split-thresh */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), ETHTOOL_RING_USE_CQE_SIZE = BIT(1), ETHTOOL_RING_USE_TX_PUSH = BIT(2), ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), ETHTOOL_RING_USE_HDS_THRS = BIT(6), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) #define __ETH_RSS_HASH(name) __ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT) #define ETH_RSS_HASH_TOP __ETH_RSS_HASH(TOP) #define ETH_RSS_HASH_XOR __ETH_RSS_HASH(XOR) #define ETH_RSS_HASH_CRC32 __ETH_RSS_HASH(CRC32) #define ETH_RSS_HASH_UNKNOWN 0 #define ETH_RSS_HASH_NO_CHANGE 0 struct net_device; struct netlink_ext_ack; /* Link extended state and substate. */ struct ethtool_link_ext_state_info { enum ethtool_link_ext_state link_ext_state; union { enum ethtool_link_ext_substate_autoneg autoneg; enum ethtool_link_ext_substate_link_training link_training; enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch; enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; enum ethtool_link_ext_substate_cable_issue cable_issue; enum ethtool_link_ext_substate_module module; u32 __link_ext_substate; }; }; struct ethtool_link_ext_stats { /* Custom Linux statistic for PHY level link down events. * In a simpler world it should be equal to netdev->carrier_down_count * unfortunately netdev also counts local reconfigurations which don't * actually take the physical link down, not to mention NC-SI which, * if present, keeps the link up regardless of host state. * This statistic counts when PHY _actually_ went down, or lost link. * * Note that we need u64 for ethtool_stats_init() and comparisons * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. */ u64 link_down_events; }; /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table * @n_rx_rings: Number of RX rings to use * * This function provides the default policy for RX flow hash indirection. */ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) { return index % n_rx_rings; } /** * struct ethtool_rxfh_context - a custom RSS context configuration * @indir_size: Number of u32 entries in indirection table * @key_size: Size of hash key, in bytes * @priv_size: Size of driver private data, in bytes * @hfunc: RSS hash function identifier. One of the %ETH_RSS_HASH_* * @input_xfrm: Defines how the input data is transformed. Valid values are one * of %RXH_XFRM_*. * @indir_configured: indir has been specified (at create time or subsequently) * @key_configured: hkey has been specified (at create time or subsequently) */ struct ethtool_rxfh_context { u32 indir_size; u32 key_size; u16 priv_size; u8 hfunc; u8 input_xfrm; u8 indir_configured:1; u8 key_configured:1; /* private: driver private data, indirection table, and hash key are * stored sequentially in @data area. Use below helpers to access. */ u32 key_off; u8 data[] __aligned(sizeof(void *)); }; static inline void *ethtool_rxfh_context_priv(struct ethtool_rxfh_context *ctx) { return ctx->data; } static inline u32 *ethtool_rxfh_context_indir(struct ethtool_rxfh_context *ctx) { return (u32 *)(ctx->data + ALIGN(ctx->priv_size, sizeof(u32))); } static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx) { return &ctx->data[ctx->key_off]; } void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); struct link_mode_info { int speed; u8 lanes; u8 duplex; }; extern const struct link_mode_info link_mode_params[]; /* declare a link mode bitmap */ #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name) \ DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS) /* drivers must ignore base.cmd and base.link_mode_masks_nwords * fields, but they are allowed to overwrite them (will be ignored). */ struct ethtool_link_ksettings { struct ethtool_link_settings base; struct { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); } link_modes; u32 lanes; }; /** * ethtool_link_ksettings_zero_link_mode - clear link_ksettings link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising */ #define ethtool_link_ksettings_zero_link_mode(ptr, name) \ bitmap_zero((ptr)->link_modes.name, __ETHTOOL_LINK_MODE_MASK_NBITS) /** * ethtool_link_ksettings_add_link_mode - set bit in link_ksettings * link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) */ #define ethtool_link_ksettings_add_link_mode(ptr, name, mode) \ __set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) /** * ethtool_link_ksettings_del_link_mode - clear bit in link_ksettings * link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) */ #define ethtool_link_ksettings_del_link_mode(ptr, name, mode) \ __clear_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) /** * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask * @ptr : pointer to struct ethtool_link_ksettings * @name : one of supported/advertising/lp_advertising * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) * * Returns: true/false. */ #define ethtool_link_ksettings_test_link_mode(ptr, name, mode) \ test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); struct ethtool_keee { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertised); u32 tx_lpi_timer; bool tx_lpi_enabled; bool eee_active; bool eee_enabled; }; struct kernel_ethtool_coalesce { u8 use_cqe_mode_tx; u8 use_cqe_mode_rx; u32 tx_aggr_max_bytes; u32 tx_aggr_max_frames; u32 tx_aggr_time_usecs; }; /** * ethtool_intersect_link_masks - Given two link masks, AND them together * @dst: first mask and where result is stored * @src: second mask to intersect with * * Given two link mode masks, AND them together and save the result in dst. */ void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, struct ethtool_link_ksettings *src); void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32); /* return false if src had higher bits set. lower bits always updated. */ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, const unsigned long *src); #define ETHTOOL_COALESCE_RX_USECS BIT(0) #define ETHTOOL_COALESCE_RX_MAX_FRAMES BIT(1) #define ETHTOOL_COALESCE_RX_USECS_IRQ BIT(2) #define ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ BIT(3) #define ETHTOOL_COALESCE_TX_USECS BIT(4) #define ETHTOOL_COALESCE_TX_MAX_FRAMES BIT(5) #define ETHTOOL_COALESCE_TX_USECS_IRQ BIT(6) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ BIT(7) #define ETHTOOL_COALESCE_STATS_BLOCK_USECS BIT(8) #define ETHTOOL_COALESCE_USE_ADAPTIVE_RX BIT(9) #define ETHTOOL_COALESCE_USE_ADAPTIVE_TX BIT(10) #define ETHTOOL_COALESCE_PKT_RATE_LOW BIT(11) #define ETHTOOL_COALESCE_RX_USECS_LOW BIT(12) #define ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW BIT(13) #define ETHTOOL_COALESCE_TX_USECS_LOW BIT(14) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW BIT(15) #define ETHTOOL_COALESCE_PKT_RATE_HIGH BIT(16) #define ETHTOOL_COALESCE_RX_USECS_HIGH BIT(17) #define ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH BIT(18) #define ETHTOOL_COALESCE_TX_USECS_HIGH BIT(19) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH BIT(20) #define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL BIT(21) #define ETHTOOL_COALESCE_USE_CQE_RX BIT(22) #define ETHTOOL_COALESCE_USE_CQE_TX BIT(23) #define ETHTOOL_COALESCE_TX_AGGR_MAX_BYTES BIT(24) #define ETHTOOL_COALESCE_TX_AGGR_MAX_FRAMES BIT(25) #define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26) #define ETHTOOL_COALESCE_RX_PROFILE BIT(27) #define ETHTOOL_COALESCE_TX_PROFILE BIT(28) #define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) #define ETHTOOL_COALESCE_MAX_FRAMES \ (ETHTOOL_COALESCE_RX_MAX_FRAMES | ETHTOOL_COALESCE_TX_MAX_FRAMES) #define ETHTOOL_COALESCE_USECS_IRQ \ (ETHTOOL_COALESCE_RX_USECS_IRQ | ETHTOOL_COALESCE_TX_USECS_IRQ) #define ETHTOOL_COALESCE_MAX_FRAMES_IRQ \ (ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ | \ ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ) #define ETHTOOL_COALESCE_USE_ADAPTIVE \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_ADAPTIVE_TX) #define ETHTOOL_COALESCE_USECS_LOW_HIGH \ (ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_TX_USECS_LOW | \ ETHTOOL_COALESCE_RX_USECS_HIGH | ETHTOOL_COALESCE_TX_USECS_HIGH) #define ETHTOOL_COALESCE_MAX_FRAMES_LOW_HIGH \ (ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW | \ ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW | \ ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH | \ ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH) #define ETHTOOL_COALESCE_PKT_RATE_RX_USECS \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | \ ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \ ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) #define ETHTOOL_COALESCE_USE_CQE \ (ETHTOOL_COALESCE_USE_CQE_RX | ETHTOOL_COALESCE_USE_CQE_TX) #define ETHTOOL_COALESCE_TX_AGGR \ (ETHTOOL_COALESCE_TX_AGGR_MAX_BYTES | \ ETHTOOL_COALESCE_TX_AGGR_MAX_FRAMES | \ ETHTOOL_COALESCE_TX_AGGR_TIME_USECS) #define ETHTOOL_STAT_NOT_SET (~0ULL) static inline void ethtool_stats_init(u64 *stats, unsigned int n) { while (n--) stats[n] = ETHTOOL_STAT_NOT_SET; } /* Basic IEEE 802.3 MAC statistics (30.3.1.1.*), not otherwise exposed * via a more targeted API. */ struct ethtool_eth_mac_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 FramesTransmittedOK; u64 SingleCollisionFrames; u64 MultipleCollisionFrames; u64 FramesReceivedOK; u64 FrameCheckSequenceErrors; u64 AlignmentErrors; u64 OctetsTransmittedOK; u64 FramesWithDeferredXmissions; u64 LateCollisions; u64 FramesAbortedDueToXSColls; u64 FramesLostDueToIntMACXmitError; u64 CarrierSenseErrors; u64 OctetsReceivedOK; u64 FramesLostDueToIntMACRcvError; u64 MulticastFramesXmittedOK; u64 BroadcastFramesXmittedOK; u64 FramesWithExcessiveDeferral; u64 MulticastFramesReceivedOK; u64 BroadcastFramesReceivedOK; u64 InRangeLengthErrors; u64 OutOfRangeLengthField; u64 FrameTooLongErrors; ); }; /* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed * via a more targeted API. */ struct ethtool_eth_phy_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 SymbolErrorDuringCarrier; ); }; /** * struct ethtool_phy_stats - PHY-level statistics counters * @rx_packets: Total successfully received frames * @rx_bytes: Total successfully received bytes * @rx_errors: Total received frames with errors (e.g., CRC errors) * @tx_packets: Total successfully transmitted frames * @tx_bytes: Total successfully transmitted bytes * @tx_errors: Total transmitted frames with errors * * This structure provides a standardized interface for reporting * PHY-level statistics counters. It is designed to expose statistics * commonly provided by PHYs but not explicitly defined in the IEEE * 802.3 standard. */ struct ethtool_phy_stats { u64 rx_packets; u64 rx_bytes; u64 rx_errors; u64 tx_packets; u64 tx_bytes; u64 tx_errors; }; /* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed * via a more targeted API. */ struct ethtool_eth_ctrl_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 MACControlFramesTransmitted; u64 MACControlFramesReceived; u64 UnsupportedOpcodesReceived; ); }; /** * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames * @src: input field denoting whether stats should be queried from the eMAC or * pMAC (if the MM layer is supported). To be ignored otherwise. * @tx_pause_frames: transmitted pause frame count. Reported to user space * as %ETHTOOL_A_PAUSE_STAT_TX_FRAMES. * * Equivalent to `30.3.4.2 aPAUSEMACCtrlFramesTransmitted` * from the standard. * * @rx_pause_frames: received pause frame count. Reported to user space * as %ETHTOOL_A_PAUSE_STAT_RX_FRAMES. Equivalent to: * * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` * from the standard. */ struct ethtool_pause_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 tx_pause_frames; u64 rx_pause_frames; ); }; #define ETHTOOL_MAX_LANES 8 /** * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC * @corrected_blocks: number of received blocks corrected by FEC * Reported to user space as %ETHTOOL_A_FEC_STAT_CORRECTED. * * Equivalent to `30.5.1.1.17 aFECCorrectedBlocks` from the standard. * * @uncorrectable_blocks: number of received blocks FEC was not able to correct * Reported to user space as %ETHTOOL_A_FEC_STAT_UNCORR. * * Equivalent to `30.5.1.1.18 aFECUncorrectableBlocks` from the standard. * * @corrected_bits: number of bits corrected by FEC * Similar to @corrected_blocks but counts individual bit changes, * not entire FEC data blocks. This is a non-standard statistic. * Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS. * * For each of the above fields, the two substructure members are: * * - @lanes: per-lane/PCS-instance counts as defined by the standard * - @total: error counts for the entire port, for drivers incapable of reporting * per-lane stats * * Drivers should fill in either only total or per-lane statistics, core * will take care of adding lane values up to produce the total. */ struct ethtool_fec_stats { struct ethtool_fec_stat { u64 total; u64 lanes[ETHTOOL_MAX_LANES]; } corrected_blocks, uncorrectable_blocks, corrected_bits; }; /** * struct ethtool_rmon_hist_range - byte range for histogram statistics * @low: low bound of the bucket (inclusive) * @high: high bound of the bucket (inclusive) */ struct ethtool_rmon_hist_range { u16 low; u16 high; }; #define ETHTOOL_RMON_HIST_MAX 11 /** * struct ethtool_rmon_stats - selected RMON (RFC 2819) statistics * @src: input field denoting whether stats should be queried from the eMAC or * pMAC (if the MM layer is supported). To be ignored otherwise. * @undersize_pkts: Equivalent to `etherStatsUndersizePkts` from the RFC. * @oversize_pkts: Equivalent to `etherStatsOversizePkts` from the RFC. * @fragments: Equivalent to `etherStatsFragments` from the RFC. * @jabbers: Equivalent to `etherStatsJabbers` from the RFC. * @hist: Packet counter for packet length buckets (e.g. * `etherStatsPkts128to255Octets` from the RFC). * @hist_tx: Tx counters in similar form to @hist, not defined in the RFC. * * Selection of RMON (RFC 2819) statistics which are not exposed via different * APIs, primarily the packet-length-based counters. * Unfortunately different designs choose different buckets beyond * the 1024B mark (jumbo frame teritory), so the definition of the bucket * ranges is left to the driver. */ struct ethtool_rmon_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 undersize_pkts; u64 oversize_pkts; u64 fragments; u64 jabbers; u64 hist[ETHTOOL_RMON_HIST_MAX]; u64 hist_tx[ETHTOOL_RMON_HIST_MAX]; ); }; /** * struct ethtool_ts_stats - HW timestamping statistics * @pkts: Number of packets successfully timestamped by the hardware. * @onestep_pkts_unconfirmed: Number of PTP packets with one-step TX * timestamping that were sent, but for which the * device offers no confirmation whether they made * it onto the wire and the timestamp was inserted * in the originTimestamp or correctionField, or * not. * @lost: Number of hardware timestamping requests where the timestamping * information from the hardware never arrived for submission with * the skb. * @err: Number of arbitrary timestamp generation error events that the * hardware encountered, exclusive of @lost statistics. Cases such * as resource exhaustion, unavailability, firmware errors, and * detected illogical timestamp values not submitted with the skb * are inclusive to this counter. */ struct ethtool_ts_stats { struct_group(tx_stats, u64 pkts; u64 onestep_pkts_unconfirmed; u64 lost; u64 err; ); }; #define ETH_MODULE_EEPROM_PAGE_LEN 128 #define ETH_MODULE_MAX_I2C_ADDRESS 0x7f /** * struct ethtool_module_eeprom - plug-in module EEPROM read / write parameters * @offset: When @offset is 0-127, it is used as an address to the Lower Memory * (@page must be 0). Otherwise, it is used as an address to the * Upper Memory. * @length: Number of bytes to read / write. * @page: Page number. * @bank: Bank number, if supported by EEPROM spec. * @i2c_address: I2C address of a page. Value less than 0x7f expected. Most * EEPROMs use 0x50 or 0x51. * @data: Pointer to buffer with EEPROM data of @length size. */ struct ethtool_module_eeprom { u32 offset; u32 length; u8 page; u8 bank; u8 i2c_address; u8 *data; }; /** * struct ethtool_module_power_mode_params - module power mode parameters * @policy: The power mode policy enforced by the host for the plug-in module. * @mode: The operational power mode of the plug-in module. Should be filled by * device drivers on get operations. */ struct ethtool_module_power_mode_params { enum ethtool_module_power_mode_policy policy; enum ethtool_module_power_mode mode; }; /** * struct ethtool_mm_state - 802.3 MAC merge layer state * @verify_time: * wait time between verification attempts in ms (according to clause * 30.14.1.6 aMACMergeVerifyTime) * @max_verify_time: * maximum accepted value for the @verify_time variable in set requests * @verify_status: * state of the verification state machine of the MM layer (according to * clause 30.14.1.2 aMACMergeStatusVerify) * @tx_enabled: * set if the MM layer is administratively enabled in the TX direction * (according to clause 30.14.1.3 aMACMergeEnableTx) * @tx_active: * set if the MM layer is enabled in the TX direction, which makes FP * possible (according to 30.14.1.5 aMACMergeStatusTx). This should be * true if MM is enabled, and the verification status is either verified, * or disabled. * @pmac_enabled: * set if the preemptible MAC is powered on and is able to receive * preemptible packets and respond to verification frames. * @verify_enabled: * set if the Verify function of the MM layer (which sends SMD-V * verification requests) is administratively enabled (regardless of * whether it is currently in the ETHTOOL_MM_VERIFY_STATUS_DISABLED state * or not), according to clause 30.14.1.4 aMACMergeVerifyDisableTx (but * using positive rather than negative logic). The device should always * respond to received SMD-V requests as long as @pmac_enabled is set. * @tx_min_frag_size: * the minimum size of non-final mPacket fragments that the link partner * supports receiving, expressed in octets. Compared to the definition * from clause 30.14.1.7 aMACMergeAddFragSize which is expressed in the * range 0 to 3 (requiring a translation to the size in octets according * to the formula 64 * (1 + addFragSize) - 4), a value in a continuous and * unbounded range can be specified here. * @rx_min_frag_size: * the minimum size of non-final mPacket fragments that this device * supports receiving, expressed in octets. */ struct ethtool_mm_state { u32 verify_time; u32 max_verify_time; enum ethtool_mm_verify_status verify_status; bool tx_enabled; bool tx_active; bool pmac_enabled; bool verify_enabled; u32 tx_min_frag_size; u32 rx_min_frag_size; }; /** * struct ethtool_mm_cfg - 802.3 MAC merge layer configuration * @verify_time: see struct ethtool_mm_state * @verify_enabled: see struct ethtool_mm_state * @tx_enabled: see struct ethtool_mm_state * @pmac_enabled: see struct ethtool_mm_state * @tx_min_frag_size: see struct ethtool_mm_state */ struct ethtool_mm_cfg { u32 verify_time; bool verify_enabled; bool tx_enabled; bool pmac_enabled; u32 tx_min_frag_size; }; /** * struct ethtool_mm_stats - 802.3 MAC merge layer statistics * @MACMergeFrameAssErrorCount: * received MAC frames with reassembly errors * @MACMergeFrameSmdErrorCount: * received MAC frames/fragments rejected due to unknown or incorrect SMD * @MACMergeFrameAssOkCount: * received MAC frames that were successfully reassembled and passed up * @MACMergeFragCountRx: * number of additional correct SMD-C mPackets received due to preemption * @MACMergeFragCountTx: * number of additional mPackets sent due to preemption * @MACMergeHoldCount: * number of times the MM layer entered the HOLD state, which blocks * transmission of preemptible traffic */ struct ethtool_mm_stats { u64 MACMergeFrameAssErrorCount; u64 MACMergeFrameSmdErrorCount; u64 MACMergeFrameAssOkCount; u64 MACMergeFragCountRx; u64 MACMergeFragCountTx; u64 MACMergeHoldCount; }; enum ethtool_mmsv_event { ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET, ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET, ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET, }; /* MAC Merge verification mPacket type */ enum ethtool_mpacket { ETHTOOL_MPACKET_VERIFY, ETHTOOL_MPACKET_RESPONSE, }; struct ethtool_mmsv; /** * struct ethtool_mmsv_ops - Operations for MAC Merge Software Verification * @configure_tx: Driver callback for the event where the preemptible TX * becomes active or inactive. Preemptible traffic * classes must be committed to hardware only while * preemptible TX is active. * @configure_pmac: Driver callback for the event where the pMAC state * changes as result of an administrative setting * (ethtool) or a call to ethtool_mmsv_link_state_handle(). * @send_mpacket: Driver-provided method for sending a Verify or a Response * mPacket. */ struct ethtool_mmsv_ops { void (*configure_tx)(struct ethtool_mmsv *mmsv, bool tx_active); void (*configure_pmac)(struct ethtool_mmsv *mmsv, bool pmac_enabled); void (*send_mpacket)(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket); }; /** * struct ethtool_mmsv - MAC Merge Software Verification * @ops: operations for MAC Merge Software Verification * @dev: pointer to net_device structure * @lock: serialize access to MAC Merge state between * ethtool requests and link state updates. * @status: current verification FSM state * @verify_timer: timer for verification in local TX direction * @verify_enabled: indicates if verification is enabled * @verify_retries: number of retries for verification * @pmac_enabled: indicates if the preemptible MAC is enabled * @verify_time: time for verification in milliseconds * @tx_enabled: indicates if transmission is enabled */ struct ethtool_mmsv { const struct ethtool_mmsv_ops *ops; struct net_device *dev; spinlock_t lock; enum ethtool_mm_verify_status status; struct timer_list verify_timer; bool verify_enabled; int verify_retries; bool pmac_enabled; u32 verify_time; bool tx_enabled; }; void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv); void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up); void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, enum ethtool_mmsv_event event); void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_state *state); void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg); void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, const struct ethtool_mmsv_ops *ops); /** * struct ethtool_rxfh_param - RXFH (RSS) parameters * @hfunc: Defines the current RSS hash function used by HW (or to be set to). * Valid values are one of the %ETH_RSS_HASH_*. * @indir_size: On SET, the array size of the user buffer for the * indirection table, which may be zero, or * %ETH_RXFH_INDIR_NO_CHANGE. On GET (read from the driver), * the array size of the hardware indirection table. * @indir: The indirection table of size @indir_size entries. * @key_size: On SET, the array size of the user buffer for the hash key, * which may be zero. On GET (read from the driver), the size of the * hardware hash key. * @key: The hash key of size @key_size bytes. * @rss_context: RSS context identifier. Context 0 is the default for normal * traffic; other contexts can be referenced as the destination for RX flow * classification rules. On SET, %ETH_RXFH_CONTEXT_ALLOC is used * to allocate a new RSS context; on return this field will * contain the ID of the newly allocated context. * @rss_delete: Set to non-ZERO to remove the @rss_context context. * @input_xfrm: Defines how the input data is transformed. Valid values are one * of %RXH_XFRM_*. */ struct ethtool_rxfh_param { u8 hfunc; u32 indir_size; u32 *indir; u32 key_size; u8 *key; u32 rss_context; u8 rss_delete; u8 input_xfrm; }; /** * struct ethtool_rxfh_fields - Rx Flow Hashing (RXFH) header field config * @data: which header fields are used for hashing, bitmask of RXH_* defines * @flow_type: L2-L4 network traffic flow type * @rss_context: RSS context, will only be used if rxfh_per_ctx_fields is * set in struct ethtool_ops */ struct ethtool_rxfh_fields { u32 data; u32 flow_type; u32 rss_context; }; /** * struct kernel_ethtool_ts_info - kernel copy of struct ethtool_ts_info * @cmd: command number = %ETHTOOL_GET_TS_INFO * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none * @phc_qualifier: qualifier of the associated PHC * @phc_source: source device of the associated PHC * @phc_phyindex: index of PHY device source of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ struct kernel_ethtool_ts_info { u32 cmd; u32 so_timestamping; int phc_index; enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_source phc_source; int phc_phyindex; enum hwtstamp_tx_types tx_types; enum hwtstamp_rx_filters rx_filters; }; /** * struct ethtool_ops - optional netdev operations * @supported_input_xfrm: supported types of input xfrm from %RXH_XFRM_*. * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. * @rxfh_per_ctx_fields: device supports selecting different header fields * for Rx hash calculation and RSS for each additional context. * @rxfh_per_ctx_key: device supports setting different RSS key for each * additional context. Netlink API should report hfunc, key, and input_xfrm * for every context, not just context 0. * @cap_rss_rxnfc_adds: device supports nonzero ring_cookie in filters with * %FLOW_RSS flag; the queue ID from the filter is added to the value from * the indirection table to determine the delivery queue. * @rxfh_indir_space: max size of RSS indirection tables, if indirection table * size as returned by @get_rxfh_indir_size may change during lifetime * of the device. Leave as 0 if the table size is constant. * @rxfh_key_space: same as @rxfh_indir_space, but for the key. * @rxfh_priv_size: size of the driver private data area the core should * allocate for an RSS context (in &struct ethtool_rxfh_context). * @rxfh_max_num_contexts: maximum (exclusive) supported RSS context ID. * If this is zero then the core may choose any (nonzero) ID, otherwise * the core will only use IDs strictly less than this value, as the * @rss_context argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @supported_hwtstamp_qualifiers: bitfield of supported hwtstamp qualifier. * @get_drvinfo: Report driver/device information. Modern drivers no * longer have to implement this callback. Most fields are * correctly filled in by the core using system information, or * populated using other driver operations. * @get_regs_len: Get buffer length required for @get_regs * @get_regs: Get device registers * @get_wol: Report whether Wake-on-Lan is enabled * @set_wol: Turn Wake-on-Lan on or off. Returns a negative error code * or zero. * @get_msglevel: Report driver message level. This should be the value * of the @msg_enable field used by netif logging functions. * @set_msglevel: Set driver message level * @nway_reset: Restart autonegotiation. Returns a negative error code * or zero. * @get_link: Report whether physical link is up. Will only be called if * the netdev is up. Should usually be set to ethtool_op_get_link(), * which uses netif_carrier_ok(). * @get_link_ext_state: Report link extended state. Should set link_ext_state and * link_ext_substate (link_ext_substate of 0 means link_ext_substate is unknown, * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. * @get_eeprom: Read data from the device EEPROM. * Should fill in the magic field. Don't need to check len for zero * or wraparound. Fill in the data argument with the eeprom values * from offset to offset + len. Update len to the amount read. * Returns an error or zero. * @set_eeprom: Write data to the device EEPROM. * Should validate the magic field. Don't need to check len for zero * or wraparound. Update len to the amount written. Returns an error * or zero. * @get_coalesce: Get interrupt coalescing parameters. Returns a negative * error code or zero. * @set_coalesce: Set interrupt coalescing parameters. Supported coalescing * types should be set in @supported_coalesce_params. * Returns a negative error code or zero. * @get_ringparam: Report ring sizes * @set_ringparam: Set ring sizes. Returns a negative error code or zero. * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. * @get_pauseparam: Report pause parameters * @set_pauseparam: Set pause parameters. Returns a negative error code * or zero. * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED * attached to it. The implementation may update the indicator * asynchronously or synchronously, but in either case it must return * quickly. It is initially called with the argument %ETHTOOL_ID_ACTIVE, * and must either activate asynchronous updates and return zero, return * a negative error or return a positive frequency for synchronous * indication (e.g. 1 for one on/off cycle per second). If it returns * a frequency then it will be called again at intervals with the * argument %ETHTOOL_ID_ON or %ETHTOOL_ID_OFF and should set the state of * the indicator accordingly. Finally, it is called with the argument * %ETHTOOL_ID_INACTIVE and must deactivate the indicator. Returns a * negative error code or zero. * @get_ethtool_stats: Return extended statistics about the device. * This is only useful if the device maintains statistics not * included in &struct rtnl_link_stats64. * @begin: Function to be called before any other operation. Returns a * negative error code or zero. * @complete: Function to be called after any other operation except * @begin. Will be called even if the other operation failed. * @get_priv_flags: Report driver-specific feature flags. * @set_priv_flags: Set driver-specific feature flags. Returns a negative * error code or zero. * @get_sset_count: Get number of strings that @get_strings will write. * @get_rxnfc: Get RX flow classification rules. Returns a negative * error code or zero. * @set_rxnfc: Set RX flow classification rules. Returns a negative * error code or zero. * @flash_device: Write a firmware image to device's flash memory. * Returns a negative error code or zero. * @reset: Reset (part of) the device, as specified by a bitmask of * flags from &enum ethtool_reset_flags. Returns a negative * error code or zero. * @get_rxfh_key_size: Get the size of the RX flow hash key. * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table. * Returns zero if not supported for this specific device. * @get_rxfh: Get the contents of the RX flow hash indirection table, hash key * and/or hash function. * Returns a negative error code or zero. * @set_rxfh: Set the contents of the RX flow hash indirection table, hash * key, and/or hash function. Arguments which are set to %NULL or zero * will remain unchanged. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. * @get_rxfh_fields: Get header fields used for flow hashing. * @set_rxfh_fields: Set header fields used for flow hashing. * @create_rxfh_context: Create a new RSS context with the specified RX flow * hash indirection table, hash key, and hash function. * The &struct ethtool_rxfh_context for this context is passed in @ctx; * note that the indir table, hkey and hfunc are not yet populated as * of this call. The driver does not need to update these; the core * will do so if this op succeeds. * However, if @rxfh.indir is set to %NULL, the driver must update the * indir table in @ctx with the (default or inherited) table actually in * use; similarly, if @rxfh.key is %NULL, @rxfh.hfunc is * %ETH_RSS_HASH_NO_CHANGE, or @rxfh.input_xfrm is %RXH_XFRM_NO_CHANGE, * the driver should update the corresponding information in @ctx. * If the driver provides this method, it must also provide * @modify_rxfh_context and @remove_rxfh_context. * Returns a negative error code or zero. * @modify_rxfh_context: Reconfigure the specified RSS context. Allows setting * the contents of the RX flow hash indirection table, hash key, and/or * hash function associated with the given context. * Parameters which are set to %NULL or zero will remain unchanged. * The &struct ethtool_rxfh_context for this context is passed in @ctx; * note that it will still contain the *old* settings. The driver does * not need to update these; the core will do so if this op succeeds. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. * @remove_rxfh_context: Remove the specified RSS context. * The &struct ethtool_rxfh_context for this context is passed in @ctx. * Returns a negative error code or zero. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. * @get_dump_flag: Get dump flag indicating current dump length, version, * and flag of the device. * @get_dump_data: Get dump data. * @set_dump: Set dump specific flags to the device. * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to * ethtool_op_get_ts_info(). * @get_ts_stats: Query the device hardware timestamping statistics. Drivers * must not zero statistics which they don't report. The stats structure * is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not * report statistics. * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module * @get_eee: Get Energy-Efficient (EEE) supported and status. * @set_eee: Set EEE status (enable/disable) as well as LPI timers. * @get_tunable: Read the value of a driver / device tunable. * @set_tunable: Set the value of a driver / device tunable. * @get_per_queue_coalesce: Get interrupt coalescing parameters per queue. * It must check that the given queue number is valid. If neither a RX nor * a TX queue has this number, return -EINVAL. If only a RX queue or a TX * queue has this number, set the inapplicable fields to ~0 and return 0. * Returns a negative error code or zero. * @set_per_queue_coalesce: Set interrupt coalescing parameters per queue. * It must check that the given queue number is valid. If neither a RX nor * a TX queue has this number, return -EINVAL. If only a RX queue or a TX * queue has this number, ignore the inapplicable fields. Supported * coalescing types should be set in @supported_coalesce_params. * Returns a negative error code or zero. * @get_link_ksettings: Get various device settings including Ethernet link * settings. The %cmd and %link_mode_masks_nwords fields should be * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), * any change to them will be overwritten by kernel. Returns a negative * error code or zero. * @set_link_ksettings: Set various device settings including Ethernet link * settings. The %cmd and %link_mode_masks_nwords fields should be * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), * any change to them will be overwritten by kernel. Returns a negative * error code or zero. * @get_fec_stats: Report FEC statistics. * Core will sum up per-lane stats to get the total. * Drivers must not zero statistics which they don't report. The stats * structure is initialized to ETHTOOL_STAT_NOT_SET indicating driver does * not report statistics. * @get_fecparam: Get the network device Forward Error Correction parameters. * @set_fecparam: Set the network device Forward Error Correction parameters. * @get_ethtool_phy_stats: Return extended statistics about the PHY device. * This is only useful if the device maintains PHY statistics and * cannot use the standard PHY library helpers. * @get_phy_tunable: Read the value of a PHY tunable. * @set_phy_tunable: Set the value of a PHY tunable. * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from * specified page. Returns a negative error code or the amount of bytes * read. * @set_module_eeprom_by_page: Write to a region of plug-in module EEPROM, * from kernel space only. Returns a negative error code or zero. * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics. * Set %ranges to a pointer to zero-terminated array of byte ranges. * @get_module_power_mode: Get the power mode policy for the plug-in module * used by the network device and its operational power mode, if * plugged-in. * @set_module_power_mode: Set the power mode policy for the plug-in module * used by the network device. * @get_mm: Query the 802.3 MAC Merge layer state. * @set_mm: Set the 802.3 MAC Merge layer parameters. * @get_mm_stats: Query the 802.3 MAC Merge layer statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must * hold the RTNL lock. * * See the structures used by these operations for further documentation. * Note that for all operations using a structure ending with a zero- * length array, the array is allocated separately in the kernel and * is passed to the driver as an additional parameter. * * See &struct net_device and &struct net_device_ops for documentation * of the generic netdev features interface. */ struct ethtool_ops { u32 supported_input_xfrm:8; u32 cap_link_lanes_supported:1; u32 rxfh_per_ctx_fields:1; u32 rxfh_per_ctx_key:1; u32 cap_rss_rxnfc_adds:1; u32 rxfh_indir_space; u16 rxfh_key_space; u16 rxfh_priv_size; u32 rxfh_max_num_contexts; u32 supported_coalesce_params; u32 supported_ring_params; u32 supported_hwtstamp_qualifiers; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); void (*get_wol)(struct net_device *, struct ethtool_wolinfo *); int (*set_wol)(struct net_device *, struct ethtool_wolinfo *); u32 (*get_msglevel)(struct net_device *); void (*set_msglevel)(struct net_device *, u32); int (*nway_reset)(struct net_device *); u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); void (*get_link_ext_stats)(struct net_device *dev, struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); int (*set_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); int (*get_coalesce)(struct net_device *, struct ethtool_coalesce *, struct kernel_ethtool_coalesce *, struct netlink_ext_ack *); int (*set_coalesce)(struct net_device *, struct ethtool_coalesce *, struct kernel_ethtool_coalesce *, struct netlink_ext_ack *); void (*get_ringparam)(struct net_device *, struct ethtool_ringparam *, struct kernel_ethtool_ringparam *, struct netlink_ext_ack *); int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *, struct kernel_ethtool_ringparam *, struct netlink_ext_ack *); void (*get_pause_stats)(struct net_device *dev, struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, struct ethtool_pauseparam*); int (*set_pauseparam)(struct net_device *, struct ethtool_pauseparam*); void (*self_test)(struct net_device *, struct ethtool_test *, u64 *); void (*get_strings)(struct net_device *, u32 stringset, u8 *); int (*set_phys_id)(struct net_device *, enum ethtool_phys_id_state); void (*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *); int (*begin)(struct net_device *); void (*complete)(struct net_device *); u32 (*get_priv_flags)(struct net_device *); int (*set_priv_flags)(struct net_device *, u32); int (*get_sset_count)(struct net_device *, int); int (*get_rxnfc)(struct net_device *, struct ethtool_rxnfc *, u32 *rule_locs); int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); int (*flash_device)(struct net_device *, struct ethtool_flash *); int (*reset)(struct net_device *, u32 *); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, struct netlink_ext_ack *extack); int (*get_rxfh_fields)(struct net_device *, struct ethtool_rxfh_fields *); int (*set_rxfh_fields)(struct net_device *, const struct ethtool_rxfh_fields *, struct netlink_ext_ack *extack); int (*create_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, const struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack); int (*modify_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, const struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack); int (*remove_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, u32 rss_context, struct netlink_ext_ack *extack); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); int (*get_dump_data)(struct net_device *, struct ethtool_dump *, void *); int (*set_dump)(struct net_device *, struct ethtool_dump *); int (*get_ts_info)(struct net_device *, struct kernel_ethtool_ts_info *); void (*get_ts_stats)(struct net_device *dev, struct ethtool_ts_stats *ts_stats); int (*get_module_info)(struct net_device *, struct ethtool_modinfo *); int (*get_module_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); int (*get_eee)(struct net_device *dev, struct ethtool_keee *eee); int (*set_eee)(struct net_device *dev, struct ethtool_keee *eee); int (*get_tunable)(struct net_device *, const struct ethtool_tunable *, void *); int (*set_tunable)(struct net_device *, const struct ethtool_tunable *, const void *); int (*get_per_queue_coalesce)(struct net_device *, u32, struct ethtool_coalesce *); int (*set_per_queue_coalesce)(struct net_device *, u32, struct ethtool_coalesce *); int (*get_link_ksettings)(struct net_device *, struct ethtool_link_ksettings *); int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); void (*get_fec_stats)(struct net_device *dev, struct ethtool_fec_stats *fec_stats); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, struct ethtool_fecparam *); void (*get_ethtool_phy_stats)(struct net_device *, struct ethtool_stats *, u64 *); int (*get_phy_tunable)(struct net_device *, const struct ethtool_tunable *, void *); int (*set_phy_tunable)(struct net_device *, const struct ethtool_tunable *, const void *); int (*get_module_eeprom_by_page)(struct net_device *dev, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack); int (*set_module_eeprom_by_page)(struct net_device *dev, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack); void (*get_eth_phy_stats)(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); int (*get_module_power_mode)(struct net_device *dev, struct ethtool_module_power_mode_params *params, struct netlink_ext_ack *extack); int (*set_module_power_mode)(struct net_device *dev, const struct ethtool_module_power_mode_params *params, struct netlink_ext_ack *extack); int (*get_mm)(struct net_device *dev, struct ethtool_mm_state *state); int (*set_mm)(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct net_device *dev, struct ethtool_mm_stats *stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); struct ethtool_rx_flow_rule { struct flow_rule *rule; unsigned long priv[]; }; struct ethtool_rx_flow_spec_input { const struct ethtool_rx_flow_spec *fs; u32 rss_ctx; }; struct ethtool_rx_flow_rule * ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input); void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *rule); bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd); int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex); /** * struct ethtool_netdev_state - per-netdevice state for ethtool features * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. */ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; struct phy_device; struct phy_tdr_config; struct phy_plca_cfg; struct phy_plca_status; /** * struct ethtool_phy_ops - Optional PHY device options * @get_sset_count: Get number of strings that @get_strings will write. * @get_strings: Return a set of strings that describe the requested objects * @get_stats: Return extended statistics about the PHY device. * @get_plca_cfg: Return PLCA configuration. * @set_plca_cfg: Set PLCA configuration. * @get_plca_status: Get PLCA configuration. * @start_cable_test: Start a cable test * @start_cable_test_tdr: Start a Time Domain Reflectometry cable test * * All operations are optional (i.e. the function pointer may be set to %NULL) * and callers must take this into account. Callers must hold the RTNL lock. */ struct ethtool_phy_ops { int (*get_sset_count)(struct phy_device *dev); int (*get_strings)(struct phy_device *dev, u8 *data); int (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); int (*get_plca_cfg)(struct phy_device *dev, struct phy_plca_cfg *plca_cfg); int (*set_plca_cfg)(struct phy_device *dev, const struct phy_plca_cfg *plca_cfg, struct netlink_ext_ack *extack); int (*get_plca_status)(struct phy_device *dev, struct phy_plca_status *plca_st); int (*start_cable_test)(struct phy_device *phydev, struct netlink_ext_ack *extack); int (*start_cable_test_tdr)(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config); }; /** * ethtool_set_ethtool_phy_ops - Set the ethtool_phy_ops singleton * @ops: Ethtool PHY operations to set */ void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops); /** * ethtool_params_from_link_mode - Derive link parameters from a given link mode * @link_ksettings: Link parameters to be derived from the link mode * @link_mode: Link mode */ void ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, enum ethtool_link_mode_bit_indices link_mode); /** * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller * is responsible to free memory of vclock_index * @dev: pointer to net_device structure * @vclock_index: pointer to pointer of vclock index * * Return: number of phc vclocks */ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); /* Some generic methods drivers may use in their ethtool_ops */ u32 ethtool_op_get_link(struct net_device *dev); int ethtool_op_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *eti); /** * ethtool_mm_frag_size_add_to_min - Translate (standard) additional fragment * size expressed as multiplier into (absolute) minimum fragment size * value expressed in octets * @val_add: Value of addFragSize multiplier */ static inline u32 ethtool_mm_frag_size_add_to_min(u32 val_add) { return (ETH_ZLEN + ETH_FCS_LEN) * (1 + val_add) - ETH_FCS_LEN; } /** * ethtool_mm_frag_size_min_to_add - Translate (absolute) minimum fragment size * expressed in octets into (standard) additional fragment size expressed * as multiplier * @val_min: Value of addFragSize variable in octets * @val_add: Pointer where the standard addFragSize value is to be returned * @extack: Netlink extended ack * * Translate a value in octets to one of 0, 1, 2, 3 according to the reverse * application of the 802.3 formula 64 * (1 + addFragSize) - 4. To be called * by drivers which do not support programming the minimum fragment size to a * continuous range. Returns error on other fragment length values. */ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, struct netlink_ext_ack *extack) { u32 add_frag_size; for (add_frag_size = 0; add_frag_size < 4; add_frag_size++) { if (ethtool_mm_frag_size_add_to_min(add_frag_size) == val_min) { *val_add = add_frag_size; return 0; } } NL_SET_ERR_MSG_MOD(extack, "minFragSize required to be one of 60, 124, 188 or 252"); return -EINVAL; } /** * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. * @dev: pointer to net_device structure * @info: buffer to hold the result * Returns: zero on success, non-zero otherwise. */ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info); /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to a pointer to the start of string to update * @fmt: Format of string to write * * Write formatted string to *data. Update *data to point at start of * next string. */ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); /** * ethtool_puts - Write string to ethtool string data * @data: Pointer to a pointer to the start of string to update * @str: String to write * * Write string to *data without a trailing newline. Update *data * to point at start of next string. * * Prefer this function to ethtool_sprintf() when given only * two arguments or if @fmt is just "%s". */ extern void ethtool_puts(u8 **data, const char *str); /** * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data * @data: Pointer to a pointer to the start of string to write into * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy from */ #define ethtool_cpy(data, str) do { \ BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN); \ memcpy(*(data), str, ETH_GSTRING_LEN); \ *(data) += ETH_GSTRING_LEN; \ } while (0) /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; __ETHTOOL_DECLARE_LINK_MODE_MASK(caps); const u32 *cap_arr; u32 arr_size; }; #define ETHTOOL_FORCED_SPEED_MAP(prefix, value) \ { \ .speed = SPEED_##value, \ .cap_arr = prefix##_##value, \ .arr_size = ARRAY_SIZE(prefix##_##value), \ } void ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size); #endif /* _LINUX_ETHTOOL_H */ |
| 93 66 1948 1939 1939 5 5 23 17 354 101 65 10 15 15 14 259 104 53 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"}, \ {I_LINKABLE, "I_LINKABLE"}, \ {I_WB_SWITCH, "I_WB_SWITCH"}, \ {I_OVL_INUSE, "I_OVL_INUSE"}, \ {I_CREATING, "I_CREATING"}, \ {I_DONTCACHE, "I_DONTCACHE"}, \ {I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \ {I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \ {I_LRU_ISOLATING, "I_LRU_ISOLATING"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_folio_template, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct folio *folio, struct bdi_writeback *wb), TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d bgrd=%d " "cyclic=%d start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, struct dirty_throttle_control *dtc, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, dtc, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, wb_setpoint) __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = dtc->limit; __entry->setpoint = (dtc->limit + freerun) / 2; __entry->dirty = dtc->dirty; __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); __entry->wb_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->wb_setpoint, __entry->wb_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 351 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking helpers. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_helper.h */ #ifndef _NF_CONNTRACK_HELPER_H #define _NF_CONNTRACK_HELPER_H #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_expect.h> #define NF_NAT_HELPER_PREFIX "ip_nat_" #define NF_NAT_HELPER_NAME(name) NF_NAT_HELPER_PREFIX name #define MODULE_ALIAS_NF_NAT_HELPER(name) \ MODULE_ALIAS(NF_NAT_HELPER_NAME(name)) struct module; enum nf_ct_helper_flags { NF_CT_HELPER_F_USERSPACE = (1 << 0), NF_CT_HELPER_F_CONFIGURED = (1 << 1), }; #define NF_CT_HELPER_NAME_LEN 16 struct nf_conntrack_helper { struct hlist_node hnode; /* Internal use. */ char name[NF_CT_HELPER_NAME_LEN]; /* name of the module */ refcount_t refcnt; struct module *me; /* pointer to self */ const struct nf_conntrack_expect_policy *expect_policy; /* Tuple of things we will help (compared against server response) */ struct nf_conntrack_tuple tuple; /* Function to call when data passes; return verdict, or -1 to invalidate. */ int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info conntrackinfo); void (*destroy)(struct nf_conn *ct); int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct); int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct); unsigned int expect_class_max; unsigned int flags; /* For user-space helpers: */ unsigned int queue_num; /* length of userspace private data stored in nf_conn_help->data */ u16 data_len; /* name of NAT helper module */ char nat_mod_name[NF_CT_HELPER_NAME_LEN]; }; /* Must be kept in sync with the classes defined by helpers */ #define NF_CT_MAX_EXPECT_CLASSES 4 /* nf_conn feature for connections that have a helper */ struct nf_conn_help { /* Helper. if any */ struct nf_conntrack_helper __rcu *helper; struct hlist_head expectations; /* Current number of expected connections */ u8 expecting[NF_CT_MAX_EXPECT_CLASSES]; /* private helper information. */ char data[32] __aligned(8); }; #define NF_CT_HELPER_BUILD_BUG_ON(structsize) \ BUILD_BUG_ON((structsize) > sizeof_field(struct nf_conn_help, data)) struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum); struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper); void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct), struct module *module); int nf_conntrack_helper_register(struct nf_conntrack_helper *); void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *, unsigned int); struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u16 proto); int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, u8 proto, bool nat, struct nf_conntrack_helper **hp); void nf_ct_helper_destroy(struct nf_conn *ct); static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) { return nf_ct_ext_find(ct, NF_CT_EXT_HELPER); } static inline void *nfct_help_data(const struct nf_conn *ct) { struct nf_conn_help *help; help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); return (void *)help->data; } int nf_conntrack_helper_init(void); void nf_conntrack_helper_fini(void); int nf_conntrack_broadcast_help(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int timeout); struct nf_ct_helper_expectfn { struct list_head head; const char *name; void (*expectfn)(struct nf_conn *ct, struct nf_conntrack_expect *exp); }; __printf(3,4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n); struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_name(const char *name); struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_symbol(const void *symbol); extern struct hlist_head *nf_ct_helper_hash; extern unsigned int nf_ct_helper_hsize; struct nf_conntrack_nat_helper { struct list_head list; char mod_name[NF_CT_HELPER_NAME_LEN]; /* module name */ struct module *module; /* pointer to self */ }; #define NF_CT_NAT_HELPER_INIT(name) \ { \ .mod_name = NF_NAT_HELPER_NAME(name), \ .module = THIS_MODULE \ } void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat); void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_nat_helper_put(struct nf_conntrack_helper *helper); #endif /*_NF_CONNTRACK_HELPER_H*/ |
| 12 65 65 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2009-2021 Christoph Hellwig * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. * * Current conventions for printing numbers measuring specific units: * * offset: byte offset into a subcomponent of a file operation * pos: file offset, in bytes * length: length of a file operation, in bytes * ino: inode number * * Numbers describing space allocations should be formatted in hexadecimal. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap #if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _IOMAP_TRACE_H #include <linux/tracepoint.h> struct inode; DECLARE_EVENT_CLASS(iomap_readpage_class, TP_PROTO(struct inode *inode, int nr_pages), TP_ARGS(inode, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(int, nr_pages) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_pages = nr_pages; ), TP_printk("dev %d:%d ino 0x%llx nr_pages %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->nr_pages) ) #define DEFINE_READPAGE_EVENT(name) \ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, loff_t off, u64 len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) __field(loff_t, offset) __field(u64, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->size = i_size_read(inode); __entry->offset = off; __entry->length = len; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->length) ) #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ TP_PROTO(struct inode *inode, loff_t off, u64 len),\ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writeback_folio); DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); DEFINE_RANGE_EVENT(iomap_dio_rw_queued); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ { IOMAP_DELALLOC, "DELALLOC" }, \ { IOMAP_MAPPED, "MAPPED" }, \ { IOMAP_UNWRITTEN, "UNWRITTEN" }, \ { IOMAP_INLINE, "INLINE" } #define IOMAP_FLAGS_STRINGS \ { IOMAP_WRITE, "WRITE" }, \ { IOMAP_ZERO, "ZERO" }, \ { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" }, \ { IOMAP_OVERWRITE_ONLY, "OVERWRITE_ONLY" }, \ { IOMAP_UNSHARE, "UNSHARE" }, \ { IOMAP_DAX, "DAX" }, \ { IOMAP_ATOMIC, "ATOMIC" }, \ { IOMAP_DONTCACHE, "DONTCACHE" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ { IOMAP_F_DIRTY, "DIRTY" }, \ { IOMAP_F_SHARED, "SHARED" }, \ { IOMAP_F_MERGED, "MERGED" }, \ { IOMAP_F_BUFFER_HEAD, "BH" }, \ { IOMAP_F_XATTR, "XATTR" }, \ { IOMAP_F_BOUNDARY, "BOUNDARY" }, \ { IOMAP_F_ANON_WRITE, "ANON_WRITE" }, \ { IOMAP_F_ATOMIC_BIO, "ATOMIC_BIO" }, \ { IOMAP_F_PRIVATE, "PRIVATE" }, \ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }, \ { IOMAP_F_STALE, "STALE" } #define IOMAP_DIO_STRINGS \ {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), TP_ARGS(inode, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(u64, addr) __field(loff_t, offset) __field(u64, length) __field(u16, type) __field(u16, flags) __field(dev_t, bdev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->addr = iomap->addr; __entry->offset = iomap->offset; __entry->length = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " "length 0x%llx type %s (0x%x) flags %s (0x%x)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), __entry->addr, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __entry->type, __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS), __entry->flags) ) #define DEFINE_IOMAP_EVENT(name) \ DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); TRACE_EVENT(iomap_add_to_ioend, TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, struct iomap *iomap), TP_ARGS(inode, pos, dirty_len, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(u64, pos) __field(u64, dirty_len) __field(u64, addr) __field(loff_t, offset) __field(u64, length) __field(u16, type) __field(u16, flags) __field(dev_t, bdev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->dirty_len = dirty_len; __entry->addr = iomap->addr; __entry->offset = iomap->offset; __entry->length = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " "addr 0x%llx offset 0x%llx length 0x%llx type %s (0x%x) flags %s (0x%x)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), __entry->pos, __entry->dirty_len, __entry->addr, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __entry->type, __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS), __entry->flags) ); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, unsigned long caller), TP_ARGS(iter, ops, caller), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) __field(u64, length) __field(int, status) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) ), TP_fast_assign( __entry->dev = iter->inode->i_sb->s_dev; __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); __entry->status = iter->status; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, __entry->status, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, (void *)__entry->caller) ); TRACE_EVENT(iomap_dio_rw_begin, TP_PROTO(struct kiocb *iocb, struct iov_iter *iter, unsigned int dio_flags, size_t done_before), TP_ARGS(iocb, iter, dio_flags, done_before), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, isize) __field(loff_t, pos) __field(size_t, count) __field(size_t, done_before) __field(int, ki_flags) __field(unsigned int, dio_flags) __field(bool, aio) ), TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = file_inode(iocb->ki_filp)->i_ino; __entry->isize = file_inode(iocb->ki_filp)->i_size; __entry->pos = iocb->ki_pos; __entry->count = iov_iter_count(iter); __entry->done_before = done_before; __entry->ki_flags = iocb->ki_flags; __entry->dio_flags = dio_flags; __entry->aio = !is_sync_kiocb(iocb); ), TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->pos, __entry->count, __entry->done_before, __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS), __entry->aio) ); TRACE_EVENT(iomap_dio_complete, TP_PROTO(struct kiocb *iocb, int error, ssize_t ret), TP_ARGS(iocb, error, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, isize) __field(loff_t, pos) __field(int, ki_flags) __field(bool, aio) __field(int, error) __field(ssize_t, ret) ), TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = file_inode(iocb->ki_filp)->i_ino; __entry->isize = file_inode(iocb->ki_filp)->i_size; __entry->pos = iocb->ki_pos; __entry->ki_flags = iocb->ki_flags; __entry->aio = !is_sync_kiocb(iocb); __entry->error = error; __entry->ret = ret; ), TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->pos, __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), __entry->aio, __entry->error, __entry->ret) ); #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
|
25
17
8
20
2
2
2
2
8
8
7
7
6
4
6
3
7
11
10
9
8
2
3
6
6
9
3
8
1
7
1
6
1
5
1
4
4
1
3
18
6
6
8
12
12
|