Total coverage: 63971 (5%)of 1585120
3 3 3 3 4 4 4 4 4 4 4 4 3 3 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 3 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/namei.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 * Hash Tree Directory indexing (c) * Daniel Phillips, 2001 * Hash Tree Directory indexing porting * Christopher Li, 2002 * Hash Tree Directory indexing cleanup * Theodore Ts'o, 2002 */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/time.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. */ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, ext4_lblk_t *block) { struct ext4_map_blocks map; struct buffer_head *bh; int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; map.m_lblk = *block; map.m_len = 1; /* * We're appending new directory block. Make sure the block is not * allocated yet, otherwise we will end up corrupting the * directory. */ err = ext4_map_blocks(NULL, inode, &map, 0); if (err < 0) return ERR_PTR(err); if (err) { EXT4_ERROR_INODE(inode, "Logical block already allocated"); return ERR_PTR(-EFSCORRUPTED); } bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; return bh; out: brelse(bh); ext4_std_error(inode->i_sb, err); return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent); /* * Hints to ext4_read_dirblock regarding whether we expect a directory * block being read to be an index block, or a block containing * directory entries (and if the latter, whether it was found via a * logical block in an htree index block). This is used to control * what sort of sanity checkinig ext4_read_dirblock() will do on the * directory block read from the storage device. EITHER will means * the caller doesn't know what kind of directory block will be read, * so no specific verification will be done. */ typedef enum { EITHER, INDEX, DIRENT, DIRENT_HTREE } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_lblk_t block, dirblock_type_t type, const char *func, unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; int is_dx_block = 0; if (block >= inode->i_size >> inode->i_blkbits) { ext4_error_inode(inode, func, line, block, "Attempting to read directory block (%u) that is past i_size (%llu)", block, inode->i_size); return ERR_PTR(-EFSCORRUPTED); } if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) bh = ERR_PTR(-EIO); else bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " "error %ld reading directory block", inode->i_ino, (unsigned long)block, current->comm, PTR_ERR(bh)); return bh; } /* The first directory block must not be a hole. */ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) { ext4_error_inode(inode, func, line, block, "Directory hole found for htree %s block %u", (type == INDEX) ? "index" : "leaf", block); return ERR_PTR(-EFSCORRUPTED); } if (!bh) return NULL; dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ if (is_dx(inode)) { if (block == 0) is_dx_block = 1; else if (ext4_rec_len_from_disk(dirent->rec_len, inode->i_sb->s_blocksize) == inode->i_sb->s_blocksize) is_dx_block = 1; } if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); brelse(bh); return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; /* * An empty leaf block can get mistaken for a index block; for * this reason, we can only check the index checksum when the * caller is sure it should be an index block. */ if (is_dx_block && type == INDEX) { if (ext4_dx_csum_verify(inode, dirent) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } if (!is_dx_block) { if (ext4_dirblock_csum_verify(inode, bh) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } return bh; } #ifdef DX_DEBUG #define dxtrace(command) command #else #define dxtrace(command) #endif struct fake_dirent { __le32 inode; __le16 rec_len; u8 name_len; u8 file_type; }; struct dx_countlimit { __le16 limit; __le16 count; }; struct dx_entry { __le32 hash; __le32 block; }; /* * dx_root_info is laid out so that if it should somehow get overlaid by a * dirent the two low bits of the hash version will be zero. Therefore, the * hash version mod 4 should never be 0. Sincerely, the paranoia department. */ struct dx_root { struct fake_dirent dot; char dot_name[4]; struct fake_dirent dotdot; char dotdot_name[4]; struct dx_root_info { __le32 reserved_zero; u8 hash_version; u8 info_length; /* 8 */ u8 indirect_levels; u8 unused_flags; } info; struct dx_entry entries[]; }; struct dx_node { struct fake_dirent fake; struct dx_entry entries[]; }; struct dx_frame { struct buffer_head *bh; struct dx_entry *entries; struct dx_entry *at; }; struct dx_map_entry { u32 hash; u16 offs; u16 size; }; /* * This goes at the end of each htree block. */ struct dx_tail { u32 dt_reserved; __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); static inline unsigned dx_get_hash(struct dx_entry *entry); static void dx_set_hash(struct dx_entry *entry, unsigned value); static unsigned dx_get_count(struct dx_entry *entries); static unsigned dx_get_limit(struct dx_entry *entries); static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry *entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *offsets, int count, unsigned int blocksize); static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); /* checksumming functions */ void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize) { struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( sizeof(struct ext4_dir_entry_tail), blocksize); t->det_reserved_ft = EXT4_FT_DIR_CSUM; } /* Walk through a dirent block to find a checksum "dirent" at the tail */ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + (blocksize - sizeof(struct ext4_dir_entry_tail))); while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; t = (struct ext4_dir_entry_tail *)d; #else t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb)); #endif if (t->det_reserved_zero1 || (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; return t; } static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } #define warn_no_space_for_csum(inode) \ __warn_no_space_for_csum((inode), __func__, __LINE__) static void __warn_no_space_for_csum(struct inode *inode, const char *func, unsigned int line) { __ext4_warning_inode(inode, func, line, "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return 0; } if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data)) return 0; return 1; } static void ext4_dirblock_csum_set(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return; } t->det_checksum = ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data); } int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dirblock_csum_set(inode, bh); return ext4_handle_dirty_metadata(handle, inode, bh); } static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dirent, int *offset) { struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); if (rlen == blocksize) count_offset = 8; else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || root->info_length != sizeof(struct dx_root_info)) return NULL; count_offset = 32; } else return NULL; if (offset) *offset = count_offset; return (struct dx_countlimit *)(((void *)dirent) + count_offset); } static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; __u32 dummy_csum = 0; int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return 0; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return 0; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, count, t)) return 0; return 1; } static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); } static inline int ext4_handle_dirty_dx_node(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); } /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) { return (struct ext4_dir_entry_2 *)((char *)p + ext4_rec_len_from_disk(p->rec_len, blocksize)); } /* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. */ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) { entry->block = cpu_to_le32(value); } static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } /* * Debug */ #ifdef DX_DEBUG static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { printk(KERN_CONT " %x->%lu", i ? dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk(KERN_CONT "\n"); } struct stats { unsigned names; unsigned space; unsigned bcount; }; static struct stats dx_show_leaf(struct inode *dir, struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; printk("names: "); while ((char *) de < base + size) { if (de->inode) { if (show_names) { #ifdef CONFIG_FS_ENCRYPTION int len; char *name; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0); int res = 0; name = de->name; len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); } else { struct fscrypt_str de_name = FSTR_INIT(name, len); /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " "crypto\n"); res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" "\n"); name = "??"; len = 2; } else { name = fname_crypto_str.name; len = fname_crypto_str.len; } if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); fscrypt_fname_free_buffer( &fname_crypto_str); } #else int len = de->name_len; char *name = de->name; (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif } space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); } printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { ext4_lblk_t block = dx_get_block(entries); ext4_lblk_t hash = i ? dx_get_hash(entries): 0; u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); bh = ext4_bread(NULL,dir, block, 0); if (!bh || IS_ERR(bh)) continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; brelse(bh); } if (bcount) printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", levels ? "" : " ", names, space/bcount, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } /* * Linear search cross check */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { while (n--) { dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; break; } } ASSERT(at == target - 1); } #else /* DX_DEBUG */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { } #endif /* DX_DEBUG */ /* * Probe for a directory leaf block to search. * * dx_probe can return ERR_BAD_DX_DIR, which means there was a format * error in the directory index, and the caller should fall back to * searching the directory normally. The callers of dx_probe **MUST** * check for this error code, and make sure it never gets reflected * back to userspace. */ static struct dx_frame * dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect, level, i; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; struct dx_frame *frame = frame_in; struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; ext4_lblk_t block; ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY && root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } if (ext4_hash_in_dirent(dir)) { if (root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash in dirent, but hash is not SIPHASH"); goto fail; } } else { if (root->info.hash_version == DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash code is SIPHASH, but hash not in dirent"); goto fail; } } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { int ret = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); if (ret < 0) { ret_err = ERR_PTR(ret); goto fail; } } hash = hinfo->hash; if (root->info.unused_flags & 1) { ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", root->info.unused_flags); goto fail; } indirect = root->info.indirect_levels; if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ext4_warning(dir->i_sb, "Directory (ino: %lu) htree depth %#06x exceed" "supported value", dir->i_ino, ext4_dir_htree_level(dir->i_sb)); if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ext4_warning(dir->i_sb, "Enable large directory " "feature to access it"); } goto fail; } entries = (struct dx_entry *)(((char *)&root->info) + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", dx_get_limit(entries), dx_root_limit(dir, root->info.info_length)); goto fail; } dxtrace(printk("Look up %x", hash)); level = 0; blocks[0] = 0; while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning_inode(dir, "dx entry: count %u beyond limit %u", count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else p = m + 1; } htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", at == entries ? 0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; block = dx_get_block(at); for (i = 0; i <= level; i++) { if (blocks[i] == block) { ext4_warning_inode(dir, "dx entry: tree cycle block %u points back to block %u", blocks[level], block); goto fail; } } if (++level > indirect) return frame; blocks[level] = block; frame++; frame->bh = ext4_read_dirblock(dir, block, INDEX); if (IS_ERR(frame->bh)) { ret_err = (struct dx_frame *) frame->bh; frame->bh = NULL; goto fail; } entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit(dir)) { ext4_warning_inode(dir, "dx entry: limit %u != node limit %u", dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning_inode(dir, "Corrupt directory, running e2fsck is recommended"); return ret_err; } static void dx_release(struct dx_frame *frames) { struct dx_root_info *info; int i; unsigned int indirect_levels; if (frames[0].bh == NULL) return; info = &((struct dx_root *)frames[0].bh->b_data)->info; /* save local copy, "info" may be freed after brelse() */ indirect_levels = info->indirect_levels; for (i = 0; i <= indirect_levels; i++) { if (frames[i].bh == NULL) break; brelse(frames[i].bh); frames[i].bh = NULL; } } /* * This function increments the frame pointer to search the next leaf * block, and reads in the necessary intervening nodes if the search * should be necessary. Whether or not the search is necessary is * controlled by the hash parameter. If the hash value is even, then * the search is only continued if the next block starts with that * hash value. This is used if we are searching for a specific file. * * If the hash value is HASH_NB_ALWAYS, then always go to the next block. * * This function returns 1 if the caller should continue to search, * or 0 if it should not. If there is an error reading one of the * index blocks, it will a negative error code. * * If start_hash is non-null, it will be filled in with the starting * hash of the next page. */ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash) { struct dx_frame *p; struct buffer_head *bh; int num_frames = 0; __u32 bhash; p = frame; /* * Find the next leaf page by incrementing the frame pointer. * If we run out of entries in the interior node, loop around and * increment pointer in the parent node. When we break out of * this loop, num_frames indicates the number of interior * nodes need to be read. */ while (1) { if (++(p->at) < p->entries + dx_get_count(p->entries)) break; if (p == frames) return 0; num_frames++; p--; } /* * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. */ bhash = dx_get_hash(p->at); if (start_hash) *start_hash = bhash; if ((hash & 1) == 0) { if ((bhash & ~1) != hash) return 0; } /* * If the hash is HASH_NB_ALWAYS, we always go to the next * block so no check is necessary */ while (num_frames--) { bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); if (IS_ERR(bh)) return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } return 1; } /* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ static int htree_dirblock_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash) { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; int csum = ext4_has_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ext4_dir_rec_len(0, csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); if (err < 0) { brelse(bh); return err; } err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { brelse(bh); return err; } } for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; } if (ext4_hash_in_dirent(dir)) { if (de->name_len && de->inode) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hinfo->hash = 0; hinfo->minor_hash = 0; } } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err < 0) { count = err; goto errout; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; if (!IS_ENCRYPTED(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { int save_len = fname_crypto_str.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); if (err) { count = err; goto errout; } err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); fname_crypto_str.len = save_len; } if (err != 0) { count = err; goto errout; } count++; } errout: brelse(bh); fscrypt_fname_free_buffer(&fname_crypto_str); return count; } /* * This function fills a red-black tree with information from a * directory. We start scanning the directory in hash order, starting * at start_hash and start_minor_hash. * * This function returns the number of entries inserted into the tree, * or a negative error code. */ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash) { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; int ret, err; __u32 hashval; struct fscrypt_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { if (ext4_hash_in_dirent(dir)) hinfo.hash_version = DX_HASH_SIPHASH; else hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; count = ext4_inlinedir_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash, &has_inline_data); if (has_inline_data) { *next_hash = ~0; return count; } } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; return count; } hinfo.hash = start_hash; hinfo.minor_hash = 0; frame = dx_probe(NULL, dir, &hinfo, frames); if (IS_ERR(frame)) return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 0, 0, de, &tmp_str); if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 2, 0, de, &tmp_str); if (err != 0) goto errout; count++; } while (1) { if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); if (ret < 0) { err = ret; goto errout; } count += ret; hashval = ~0; ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, frame, frames, &hashval); *next_hash = hashval; if (ret < 0) { err = ret; goto errout; } /* * Stop if: (a) there are no more entries, or * (b) we have inserted at least one entry and the * next hash value is not a continuation */ if ((ret == 0) || (count && ((hashval & 1) == 0))) break; } dx_release(frames); dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); return (err); } static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, fname, offset, res_dir); } /* * Directory block splitting, compacting */ /* * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, ((char *)de) - base)) return -EFSCORRUPTED; if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); else { int err = ext4fs_dirhash(dir, de->name, de->name_len, &h); if (err < 0) return err; } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; map_tail->size = ext4_rec_len_from_disk(de->rec_len, blocksize); count++; cond_resched(); } de = ext4_next_entry(de, blocksize); } return count; } /* Sort map by hash value */ static void dx_sort_map (struct dx_map_entry *map, unsigned count) { struct dx_map_entry *p, *q, *top = map + count - 1; int more; /* Combsort until bubble sort doesn't suck */ while (count > 2) { count = count*10/13; if (count - 9 < 2) /* 9, 10 -> 11 */ count = 11; for (p = top, q = p - count; q >= map; p--, q--) if (p->hash < q->hash) swap(*p, *q); } /* Garden variety bubble sort */ do { more = 0; q = top; while (q-- > map) { if (q[1].hash >= q[0].hash) continue; swap(*(q+1), *q); more = 1; } } while(more); } static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) { struct dx_entry *entries = frame->entries; struct dx_entry *old = frame->at, *new = old + 1; int count = dx_get_count(entries); ASSERT(count < dx_get_limit(entries)); ASSERT(old < entries + count); memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); dx_set_hash(new, hash); dx_set_block(new, block); dx_set_count(entries, count + 1); } #ifdef CONFIG_UNICODE /* * Test whether a case-insensitive directory entry matches the filename * being searched for. If quick is set, assume the name being looked up * is already in the casefolded form. * * Returns: 0 if the directory entry matches, more than 0 if it * doesn't match or less than zero on error. */ static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, u8 *de_name, size_t de_name_len, bool quick) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr entry = QSTR_INIT(de_name, de_name_len); int ret; if (IS_ENCRYPTED(parent)) { const struct fscrypt_str encrypted_name = FSTR_INIT(de_name, de_name_len); decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); if (!decrypted_name.name) return -ENOMEM; ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, &decrypted_name); if (ret < 0) goto out; entry.name = decrypted_name.name; entry.len = decrypted_name.len; } if (quick) ret = utf8_strncasecmp_folded(um, name, &entry); else ret = utf8_strncasecmp(um, name, &entry); if (ret < 0) { /* Handle invalid character sequence as either an error * or as an opaque byte sequence. */ if (sb_has_strict_encoding(sb)) ret = -EINVAL; else if (name->len != entry.len) ret = 1; else ret = !!memcmp(name->name, entry.name, entry.len); } out: kfree(decrypted_name.name); return ret; } int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { struct fscrypt_str *cf_name = &name->cf_name; struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; } cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!cf_name->name) return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, cf_name->name, EXT4_NAME_LEN); if (len <= 0) { kfree(cf_name->name); cf_name->name = NULL; } cf_name->len = (unsigned) len; if (!IS_ENCRYPTED(dir)) return 0; hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif /* * Test whether a directory entry matches the filename being searched for. * * Return: %true if the directory entry matches, otherwise %false. */ static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { struct fscrypt_name f; if (!de->inode) return false; f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif #ifdef CONFIG_UNICODE if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; if (IS_ENCRYPTED(parent)) { if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)) { return 0; } } return !ext4_ci_compare(parent, &cf, de->name, de->name_len, true); } return !ext4_ci_compare(parent, fname->usr_fname, de->name, de->name_len, false); } #endif return fscrypt_match_name(&f, de->name, de->name_len); } /* * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, buf_size, offset)) return -EFSCORRUPTED; *res_dir = de; return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -EFSCORRUPTED; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, struct ext4_dir_entry *de) { struct super_block *sb = dir->i_sb; if (!is_dx(dir)) return 0; if (block == 0) return 1; if (de->inode == 0 && ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == sb->s_blocksize) return 1; return 0; } /* * __ext4_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the cache buffer in which the entry was found, and the entry * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ static struct buffer_head *__ext4_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block; const u8 *name = fname->usr_fname->name; size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ size_t ra_ptr = 0; /* Current index into readahead buffer */ ext4_lblk_t nblocks; int i, namelen, retval; *res_dir = NULL; sb = dir->i_sb; namelen = fname->usr_fname->len; if (namelen > EXT4_NAME_LEN) return NULL; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; ret = ext4_find_inline_entry(dir, fname, res_dir, &has_inline_data); if (inlined) *inlined = has_inline_data; if (has_inline_data) goto cleanup_and_exit; } if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS */ block = start = 0; nblocks = 1; goto restart; } if (is_dx(dir)) { ret = ext4_dx_find_entry(dir, fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (!nblocks) { ret = NULL; goto cleanup_and_exit; } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; block = start; restart: do { /* * We deal with the read-ahead logic here. */ cond_resched(); if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; if (block < start) ra_max = start - block; else ra_max = nblocks - block; ra_max = min(ra_max, ARRAY_SIZE(bh_use)); retval = ext4_bread_batch(dir, block, ra_max, false /* wait */, bh_use); if (retval) { ret = ERR_PTR(retval); ra_max = 0; goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_ERR(dir, EIO, "reading directory lblock %lu", (unsigned long) block); brelse(bh); ret = ERR_PTR(-EIO); goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); ret = ERR_PTR(-EFSBADCRC); goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; goto cleanup_and_exit; } else { brelse(bh); if (i < 0) { ret = ERR_PTR(i); goto cleanup_and_exit; } } next: if (++block >= nblocks) block = 0; } while (block != start); /* * If the directory has grown while we were searching, then * search the last part of the directory before giving up. */ block = nblocks; nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (block < nblocks) { start = 0; goto restart; } cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); return ret; } static struct buffer_head *ext4_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *inlined) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_setup_filename(dir, d_name, 1, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct dentry *dentry, struct ext4_dir_entry_2 **res_dir) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; #ifdef CONFIG_FS_ENCRYPTION *res_dir = NULL; #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) goto errout; retval = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) goto success; brelse(bh); if (retval < 0) { bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning_inode(dir, "error %d reading directory index block", retval); bh = ERR_PTR(retval); goto errout; } } while (retval == 1); bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de; struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); bh = ext4_lookup_entry(dir, dentry, &de); if (IS_ERR(bh)) return ERR_CAST(bh); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (unlikely(ino == dir->i_ino)) { EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", dentry); return ERR_PTR(-EFSCORRUPTED); } inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } } #ifdef CONFIG_UNICODE if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry * from being cached. */ return NULL; } #endif return d_splice_alias(inode, dentry); } struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; struct ext4_dir_entry_2 * de; struct buffer_head *bh; bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(child->d_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); rec_len = ext4_dir_rec_len(de->name_len, dir); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); map++; to += rec_len; } return (struct ext4_dir_entry_2 *) (to - rec_len); } /* * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } de = next; } return prev; } /* * Split a full leaf block to make room for a new dir entry. * Allocate a new block, and move entries so that they are approx. equally full. * Returns pointer to de in block into which the new entry will be inserted. */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned continued; int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; int csum_size = 0; int err = 0, i; if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; return (struct ext4_dir_entry_2 *) bh2; } BUFFER_TRACE(*bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; data2 = bh2->b_data; /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); count = dx_make_map(dir, *bh, hinfo, map); if (count < 0) { err = count; goto journal_error; } map -= count; dx_sort_map(map, count); /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { /* is more than half of this entry in 2nd half of the block? */ if (size + map[i].size/2 > blocksize/2) break; size += map[i].size; move++; } /* * map index at which we will split * * If the sum of active entries didn't exceed half the block size, just * split it in half by count; each resulting block will have at least * half the space free. */ if (i > 0) split = count - move; else split = count/2; hash2 = map[split].hash; continued = split > 0 ? hash2 == map[split - 1].hash : 0; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(dir, data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, blocksize); de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de2, blocksize); if (csum_size) { ext4_initialize_dirent_tail(*bh, blocksize); ext4_initialize_dirent_tail(bh2, blocksize); } dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); /* Which block gets the new entry? */ if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; } dx_insert_block(frame, hash2 + continued, newblock); err = ext4_handle_dirty_dirblock(handle, dir, bh2); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); dxtrace(dx_show_index("frame", frame->entries)); return de; journal_error: brelse(*bh); brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); return ERR_PTR(err); } int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); int nlen, rlen; unsigned int offset = 0; char *top; de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; if (ext4_match(dir, fname, de)) return -EEXIST; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -ENOSPC; *dest_de = de; return 0; } void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname) { int nlen, rlen; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); de = de1; } de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); if (ext4_hash_in_dirent(dir)) { struct dx_hash_info *hinfo = &fname->hinfo; EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); EXT4_DIRENT_HASHES(de)->minor_hash = cpu_to_le32(hinfo->minor_hash); } } /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large * enough for new directory entry. If de is NULL, then * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err, err2; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { err = ext4_find_dest_de(dir, inode, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_std_error(dir->i_sb, err); return err; } /* By now the buffer is marked for journaling */ ext4_insert_dentry(dir, inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); return err ? err : err2; } static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root) { struct fake_dirent *fde; const char *error_msg; unsigned int rlen; unsigned int blocksize = dir->i_sb->s_blocksize; char *blockend = (char *)root + dir->i_sb->s_blocksize; fde = &root->dot; if (unlikely(fde->name_len != 1)) { error_msg = "invalid name_len for '.'"; goto corrupted; } if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) { error_msg = "invalid name for '.'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '.'"; goto corrupted; } fde = &root->dotdot; if (unlikely(fde->name_len != 2)) { error_msg = "invalid name_len for '..'"; goto corrupted; } if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) { error_msg = "invalid name for '..'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '..'"; goto corrupted; } return true; corrupted: EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended", error_msg); return false; } /* * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. */ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct buffer_head *bh) { struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; char *data2, *top; unsigned len; int retval; unsigned blocksize; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (retval) { ext4_std_error(dir->i_sb, retval); brelse(bh); return retval; } root = (struct dx_root *) bh->b_data; if (!ext4_check_dx_root(dir, root)) { brelse(bh); return -EFSCORRUPTED; } /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block); if (IS_ERR(bh2)) { brelse(bh); return PTR_ERR(bh2); } ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data2 = bh2->b_data; memcpy(data2, de, len); memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, (data2 + (blocksize - csum_size) - (char *) de))) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; } de = de2; } de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh2, blocksize); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); de->rec_len = ext4_rec_len_to_disk( blocksize - ext4_dir_rec_len(2, NULL), blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); if (ext4_hash_in_dirent(dir)) root->info.hash_version = DX_HASH_SIPHASH; else root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; entries = root->entries; dx_set_block(entries, 1); dx_set_count(entries, 1); dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ fname->hinfo.hash_version = root->info.hash_version; if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* casefolded encrypted hashes are computed on fname setup */ if (!ext4_hash_in_dirent(dir)) { int err = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); if (err < 0) { brelse(bh2); brelse(bh); return err; } } memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; frame->at = entries; frame->bh = bh; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; retval = ext4_handle_dirty_dirblock(handle, dir, bh2); if (retval) goto out_frames; de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ if (retval) ext4_mark_inode_dirty(handle, dir); dx_release(frames); brelse(bh2); return retval; } /* * ext4_add_entry() * * adds a file entry to the specified directory, using the same * semantics as ext4_find_entry(). It returns NULL if it failed. * * NOTE!! The inode part of 'de' is left at 0 - which means you * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; int csum_size = 0; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; #ifdef CONFIG_UNICODE if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) return retval; if (ext4_has_inline_data(dir)) { retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { retval = 0; goto out; } } if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? */ if (ext4_has_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; goto out; } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; retval = ext4_mark_inode_dirty(handle, dir); if (unlikely(retval)) goto out; } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); if (bh == NULL) { bh = ext4_bread(handle, dir, block, EXT4_GET_BLOCKS_CREATE); goto add_to_new_block; } if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } retval = add_dirent_to_buf(handle, &fname, dir, inode, NULL, bh); if (retval != -ENOSPC) goto out; if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; } brelse(bh); } bh = ext4_append(handle, dir, &block); add_to_new_block: if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh, blocksize); retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); out: ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); return retval; } /* * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int restart; int err; again: restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; at = frame->at; bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto cleanup; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto journal_error; err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; int levels = frame - frames + 1; unsigned int icount; int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; while (frame > frames) { if (dx_get_count((frame - 1)->entries) < dx_get_limit((frame - 1)->entries)) { add_level = 0; break; } frame--; /* split higher index block */ at = frame->at; entries = frame->entries; restart = 1; } if (add_level && levels == ext4_dir_htree_level(sb)) { ext4_warning(sb, "Directory (ino: %lu) index full, " "reach max htree level :%d", dir->i_ino, levels); if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ext4_warning(sb, "Large directory feature is " "not enabled on this " "filesystem"); } err = -ENOSPC; goto cleanup; } icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); goto cleanup; } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); if (err) goto journal_error; memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); dx_set_count(entries, icount1); dx_set_count(entries2, icount2); dx_set_limit(entries2, dx_node_limit(dir)); /* Which index block gets the new entry? */ if (at - entries >= icount1) { frame->at = at - entries - icount1 + entries2; frame->entries = entries = entries2; swap(frame->bh, bh2); } dx_insert_block((frame - 1), hash2, newblock); dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (restart || err) goto journal_error; } else { struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); dxroot = (struct dx_root *)frames[0].bh->b_data; dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); /* @restart is true means htree-path has been changed, we need to * repeat dx_probe() to find out valid htree-path */ if (restart && err == 0) goto again; return err; } /* * ext4_generic_delete_entry deletes a directory entry by merging it * with the previous entry */ int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; pde = NULL; de = (struct ext4_dir_entry_2 *)entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { if (pde) { pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, blocksize) + ext4_rec_len_from_disk(de->rec_len, blocksize), blocksize); /* wipe entire dir_entry */ memset(de, 0, ext4_rec_len_from_disk(de->rec_len, blocksize)); } else { /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); } inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); pde = de; de = ext4_next_entry(de, blocksize); } return -ENOENT; } static int ext4_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh) { int err, csum_size = 0; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; err = ext4_delete_inline_entry(handle, dir, de_del, bh, &has_inline_data); if (has_inline_data) return err; } if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data, dir->i_sb->s_blocksize, csum_size); if (err) goto out; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (unlikely(err)) goto out; return 0; out: if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2 * since this indicates that nlinks count was previously 1 to avoid overflowing * the 16-bit i_links_count field on disk. Directories with i_nlink == 1 mean * that subdirectory link counts are not being maintained accurately. * * The caller has already checked for i_nlink overflow in case the DIR_LINK * feature is not enabled and returned -EMLINK. The is_dx() check is a proxy * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set * on regular files) and to avoid creating huge/slow non-HTREE directories. */ static void ext4_inc_count(struct inode *inode) { inc_nlink(inode); if (is_dx(inode) && (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) set_nlink(inode, 1); } /* * If a directory had nlink == 1, then we should let it be 1. This indicates * directory has >EXT4_LINK_MAX subdirs. */ static void ext4_dec_count(struct inode *inode) { if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) drop_nlink(inode); } /* * Add non-directory inode to a directory. On success, the inode reference is * consumed by dentry is instantiation. This is also indicated by clearing of * *inodep pointer. On failure, the caller is responsible for dropping the * inode reference in the safe context. */ static int ext4_add_nondir(handle_t *handle, struct dentry *dentry, struct inode **inodep) { struct inode *dir = d_inode(dentry->d_parent); struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); d_instantiate_new(dentry, inode); *inodep = NULL; return err; } drop_nlink(inode); ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; } /* * By the time this is called, we already have created * the directory cache entry for the new file, but it * is so far negative - it has no inode. * * If the create succeeds, we fill in the inode information * with d_instantiate(). */ static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, retries = 0; err = dquot_initialize(dir); if (err) return err; retry: inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 4 + EXT4_XATTR_TRANS_BLOCKS); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } if (handle) ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); return err; } struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, unsigned int parent_ino, int dotdot_real_len) { de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, "."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(parent_ino); de->name_len = 2; if (!dotdot_real_len) de->rec_len = ext4_rec_len_to_disk(blocksize - (csum_size + ext4_dir_rec_len(1, NULL)), blocksize); else de->rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, ".."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); return ext4_next_entry(de, blocksize); } int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { err = ext4_try_create_inline_dir(handle, dir, inode); if (err < 0 && err != -ENOSPC) goto out; if (!err) goto out; } inode->i_size = 0; dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); de = (struct ext4_dir_entry_2 *)dir_block->b_data; ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); set_nlink(inode, 2); if (csum_size) ext4_initialize_dirent_tail(dir_block, blocksize); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) goto out; set_buffer_verified(dir_block); out: brelse(dir_block); return err; } static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2)) err = err2; ext4_journal_stop(handle); iput(inode); goto out_retry; } ext4_inc_count(dir); ext4_update_dx_flag(dir); err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; d_instantiate_new(dentry, inode); ext4_fc_track_create(handle, dentry); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); out_stop: if (handle) ext4_journal_stop(handle); out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } /* * routine to check that the specified directory is empty (for rmdir) */ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { int has_inline_data = 1; int ret; ret = empty_inline_dir(inode, &has_inline_data); if (has_inline_data) return ret; } sb = inode->i_sb; if (inode->i_size < ext4_dir_rec_len(1, NULL) + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); return false; } bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) return false; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); return false; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); bh = ext4_read_dirblock(inode, lblock, EITHER); if (bh == NULL) { offset += sb->s_blocksize; continue; } if (IS_ERR(bh)) return false; } de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode)) { brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } brelse(bh); return true; } static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; handle_t *handle = NULL; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) return -EIO; /* Initialize quotas before so that eventual writes go in * separate transaction */ retval = dquot_initialize(dir); if (retval) return retval; retval = dquot_initialize(d_inode(dentry)); if (retval) return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto end_rmdir; inode = d_inode(dentry); retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; if (!ext4_empty_dir(inode)) goto end_rmdir; handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rmdir; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) ext4_warning_inode(inode, "empty directory '%.*s' has too many links (%u)", dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode_inc_iversion(inode); clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is * zero will ensure that the right thing happens during any * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; ext4_dec_count(dir); ext4_update_dx_flag(dir); ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif end_rmdir: brelse(bh); if (handle) ext4_journal_stop(handle); return retval; } int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; handle_t *handle; int skip_remove_dentry = 0; /* * Keep this outside the transaction; it may have to set up the * directory's encryption key, which isn't GFP_NOFS-safe. */ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) return -ENOENT; if (le32_to_cpu(de->inode) != inode->i_ino) { /* * It's okay if we find dont find dentry which matches * the inode. That's because it might have gotten * renamed to a different inode number */ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else goto out_bh; } handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_bh; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) goto out_handle; } else { retval = 0; } if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", d_name->len, d_name->name); else drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); out_handle: ext4_journal_stop(handle); out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) return -EIO; trace_ext4_unlink_enter(dir, dentry); /* * Initialize quotas before so that eventual writes go * in separate transaction */ retval = dquot_initialize(dir); if (retval) goto out_trace; retval = dquot_initialize(d_inode(dentry)); if (retval) goto out_trace; retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif out_trace: trace_ext4_unlink_exit(dentry, retval); return retval; } static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; struct inode *inode; int err, len = strlen(symname); int credits; struct fscrypt_str disk_link; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); if (err) return err; err = dquot_initialize(dir); if (err) return err; if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, * group descriptor, sb, inode block, quota blocks, and * possibly selinux xattr blocks. */ credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), * allocate new inode (bitmap, group descriptor, inode block, * quota blocks, sb is already counted in previous macros). */ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); return PTR_ERR(inode); } if (IS_ENCRYPTED(inode)) { err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) goto err_drop_inode; inode->i_op = &ext4_encrypted_symlink_inode_operations; } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { if (!IS_ENCRYPTED(inode)) inode->i_op = &ext4_symlink_inode_operations; inode_nohighmem(inode); ext4_set_aops(inode); /* * We cannot call page_symlink() with transaction started * because it calls into ext4_write_begin() which can wait * for transaction commit if we are running out of space * and thus we deadlock. So we have to stop transaction now * and restart it when symlink contents is written. * * To keep fs consistent in case of crash, we have to put inode * to orphan list in the mean time. */ drop_nlink(inode); err = ext4_orphan_add(handle, inode); if (handle) ext4_journal_stop(handle); handle = NULL; if (err) goto err_drop_inode; err = __page_symlink(inode, disk_link.name, disk_link.len, 1); if (err) goto err_drop_inode; /* * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified */ handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); handle = NULL; goto err_drop_inode; } set_nlink(inode, 1); err = ext4_orphan_del(handle, inode); if (err) goto err_drop_inode; } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); if (!IS_ENCRYPTED(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_link = (char *)&EXT4_I(inode)->i_data; } memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, disk_link.len); inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); if (inode) iput(inode); goto out_free_encrypted_link; err_drop_inode: if (handle) ext4_journal_stop(handle); clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); unlock_new_inode(inode); iput(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); return err; } int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) { handle_t *handle; int err, retries = 0; retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); inode->i_ctime = current_time(inode); ext4_inc_count(inode); ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time */ if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); ext4_fc_track_link(handle, dentry); } else { drop_nlink(inode); iput(inode); } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int err; if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; err = fscrypt_prepare_link(old_dentry, dir, dentry); if (err) return err; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; err = dquot_initialize(dir); if (err) return err; return __ext4_link(dir, inode, dentry); } /* * Try to find buffer head where contains the parent block. * It should be the inode block if it is inlined or the 1st block * if it is a normal dir. */ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct inode *inode, int *retval, struct ext4_dir_entry_2 **parent_de, int *inlined) { struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { struct ext4_dir_entry_2 *de; unsigned int offset; bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) { *retval = PTR_ERR(bh); return NULL; } de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { EXT4_ERROR_INODE(inode, "directory missing '.'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } offset = ext4_rec_len_from_disk(de->rec_len, inode->i_sb->s_blocksize); de = ext4_next_entry(de, inode->i_sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { EXT4_ERROR_INODE(inode, "directory missing '..'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } *parent_de = de; return bh; } *inlined = 1; return ext4_get_first_inline_block(inode, parent_de, retval); } struct ext4_renament { struct inode *dir; struct dentry *dentry; struct inode *inode; bool is_dir; int dir_nlink_delta; /* entry for "dentry" */ struct buffer_head *bh; struct ext4_dir_entry_2 *de; int inlined; /* entry for ".." in inode if it's a directory */ struct buffer_head *dir_bh; struct ext4_dir_entry_2 *parent_de; int dir_inlined; }; static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) { int retval; ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); if (!ent->dir_bh) return retval; if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); return ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->dir_bh, EXT4_JTR_NONE); } static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, unsigned dir_ino) { int retval; ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { if (is_dx(ent->inode)) { retval = ext4_handle_dirty_dx_node(handle, ent->inode, ent->dir_bh); } else { retval = ext4_handle_dirty_dirblock(handle, ent->inode, ent->dir_bh); } } else { retval = ext4_mark_inode_dirty(handle, ent->inode); } if (retval) { ext4_std_error(ent->dir->i_sb, retval); return retval; } return 0; } static int ext4_setent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, EXT4_JTR_NONE); if (retval) return retval; ent->de->inode = cpu_to_le32(ino); if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); ent->dir->i_ctime = ent->dir->i_mtime = current_time(ent->dir); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); if (unlikely(retval2)) { ext4_std_error(ent->dir->i_sb, retval2); return retval2; } } return retval; } static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { struct ext4_renament old = *ent; int retval = 0; /* * old->de could have moved from under us during make indexed dir, * so the old->de may no longer valid and need to find it again * before reset old inode info. */ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) retval = PTR_ERR(old.bh); if (!old.bh) retval = -ENOENT; if (retval) { ext4_std_error(old.dir->i_sb, retval); return; } ext4_setent(handle, &old, ino, file_type); brelse(old.bh); } static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, const struct qstr *d_name) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (bh) { retval = ext4_delete_entry(handle, dir, de, bh); brelse(bh); } return retval; } static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, int force_reread) { int retval; /* * ent->de could have moved from under us during htree split, so make * sure that we are deleting the right entry. We might also be pointing * to a stale entry in the unused part of ent->bh so just checking inum * and the name isn't enough. */ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || ent->de->name_len != ent->dentry->d_name.len || strncmp(ent->de->name, ent->dentry->d_name.name, ent->de->name_len) || force_reread) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } else { retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); if (retval == -ENOENT) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } } if (retval) { ext4_warning_inode(ent->dir, "Deleting old file: nlink %d, error=%d", ent->dir->i_nlink, retval); } } static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) { if (ent->dir_nlink_delta) { if (ent->dir_nlink_delta == -1) ext4_dec_count(ent->dir); else ext4_inc_count(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); } } static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, struct ext4_renament *ent, int credits, handle_t **h) { struct inode *wh; handle_t *handle; int retries = 0; /* * for inode block, sb block, group summaries, * and inode bitmap */ credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: wh = ext4_new_inode_start_handle(mnt_userns, ent->dir, S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(wh)) { if (handle) ext4_journal_stop(handle); if (PTR_ERR(wh) == -ENOSPC && ext4_should_retry_alloc(ent->dir->i_sb, &retries)) goto retry; } else { *h = handle; init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); wh->i_op = &ext4_special_inode_operations; } return wh; } /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. * * n.b. old_{dentry,inode) refers to the source dentry/inode * while new_{dentry,inode) refers to the destination dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; int force_reread; int retval; struct inode *whiteout = NULL; int credits; u8 old_file_type; if (new.inode && new.inode->i_nlink == 0) { EXT4_ERROR_INODE(new.inode, "target of rename is already freed"); return -EFSCORRUPTED; } if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(old.inode); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new.inode) { retval = dquot_initialize(new.inode); if (retval) return retval; } old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto release_bh; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto release_bh; } if (new.bh) { if (!new.inode) { brelse(new.bh); new.bh = NULL; } } if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) ext4_alloc_da_blocks(old.inode); credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); if (!(flags & RENAME_WHITEOUT)) { handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto release_bh; } } else { whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); goto release_bh; } } old_file_type = old.de->file_type; if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { if (new.inode) { retval = -ENOTEMPTY; if (!ext4_empty_dir(new.inode)) goto end_rename; } else { retval = -EMLINK; if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } retval = ext4_rename_dir_prepare(handle, &old); if (retval) goto end_rename; } /* * If we're renaming a file within an inline_data dir and adding or * setting the new dirent causes a conversion from inline_data to * extents/blockmap, we need to force the dirent delete code to * re-read the directory, or else we end up trying to delete a dirent * from what is now the extent tree root (or a block map). */ force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); if (whiteout) { /* * Do this before adding a new entry, so the old entry is sure * to be still pointing to the valid old entry. */ retval = ext4_setent(handle, &old, whiteout->i_ino, EXT4_FT_CHRDEV); if (retval) goto end_rename; retval = ext4_mark_inode_dirty(handle, whiteout); if (unlikely(retval)) goto end_rename; } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { retval = ext4_setent(handle, &new, old.inode->i_ino, old_file_type); if (retval) goto end_rename; } if (force_reread) force_reread = !ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA); /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ old.inode->i_ctime = current_time(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; if (!whiteout) { /* * ok, that's it */ ext4_rename_delete(handle, &old, force_reread); } if (new.inode) { ext4_dec_count(new.inode); new.inode->i_ctime = current_time(new.inode); } old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; ext4_dec_count(old.dir); if (new.inode) { /* checked ext4_empty_dir above, can't have another * parent, ext4_dec_count() won't work for many-linked * dirs */ clear_nlink(new.inode); } else { ext4_inc_count(new.dir); ext4_update_dx_flag(new.dir); retval = ext4_mark_inode_dirty(handle, new.dir); if (unlikely(retval)) goto end_rename; } } retval = ext4_mark_inode_dirty(handle, old.dir); if (unlikely(retval)) goto end_rename; if (S_ISDIR(old.inode->i_mode)) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, EXT4_FC_REASON_RENAME_DIR, handle); } else { struct super_block *sb = old.inode->i_sb; if (new.inode) ext4_fc_track_unlink(handle, new.dentry); if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) { __ext4_fc_track_link(handle, old.inode, new.dentry); __ext4_fc_track_unlink(handle, old.inode, old.dentry); if (whiteout) __ext4_fc_track_create(handle, whiteout, old.dentry); } } if (new.inode) { retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; if (!new.inode->i_nlink) ext4_orphan_add(handle, new.inode); } retval = 0; end_rename: if (whiteout) { if (retval) { ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); ext4_journal_stop(handle); iput(whiteout); } else { ext4_journal_stop(handle); } release_bh: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); return retval; } static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; u8 new_file_type; int retval; struct timespec64 ctime; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid)) || (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(old_dir)->i_projid, EXT4_I(new_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto end_rename; } /* RENAME_EXCHANGE case: old *and* new must both exist */ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) goto end_rename; handle = ext4_journal_start(old.dir, EXT4_HT_DIR, (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rename; } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { old.is_dir = true; retval = ext4_rename_dir_prepare(handle, &old); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { new.is_dir = true; retval = ext4_rename_dir_prepare(handle, &new); if (retval) goto end_rename; } /* * Other than the special case of overwriting a directory, parents' * nlink only needs to be modified if this is a cross directory rename. */ if (old.dir != new.dir && old.is_dir != new.is_dir) { old.dir_nlink_delta = old.is_dir ? -1 : 1; new.dir_nlink_delta = -old.dir_nlink_delta; retval = -EMLINK; if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) goto end_rename; } new_file_type = new.de->file_type; retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); if (retval) goto end_rename; retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); if (retval) goto end_rename; /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ ctime = current_time(old.inode); old.inode->i_ctime = ctime; new.inode->i_ctime = ctime; retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; } if (new.dir_bh) { retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); if (retval) goto end_rename; } ext4_update_dir_count(handle, &old); ext4_update_dir_count(handle, &new); retval = 0; end_rename: brelse(old.dir_bh); brelse(new.dir_bh); brelse(old.bh); brelse(new.bh); if (handle) ext4_journal_stop(handle); return retval; } static int ext4_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int err; if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (err) return err; if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); } return ext4_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags); } /* * directories can handle most operations... */ const struct inode_operations ext4_dir_inode_operations = { .create = ext4_create, .lookup = ext4_lookup, .link = ext4_link, .unlink = ext4_unlink, .symlink = ext4_symlink, .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, .rename = ext4_rename2, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_acl = ext4_get_acl, .set_acl = ext4_set_acl, };
394 392 394 394 395 115 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 // SPDX-License-Identifier: GPL-2.0 /* * bus.c - bus driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); static struct bus_type *bus_get(struct bus_type *bus) { if (bus) { kset_get(&bus->p->subsys); return bus; } return NULL; } static void bus_put(struct bus_type *bus) { if (bus) kset_put(&bus->p->subsys); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* * sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for reading a bus attribute without show() */ ssize_t ret = -EIO; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for writing a bus attribute without store() */ ssize_t ret = -EIO; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(struct bus_type *bus, struct bus_attribute *attr) { int error; if (bus_get(bus)) { error = sysfs_create_file(&bus->p->subsys.kobj, &attr->attr); bus_put(bus); } else error = -EINVAL; return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(struct bus_type *bus, struct bus_attribute *attr) { if (bus_get(bus)) { sysfs_remove_file(&bus->p->subsys.kobj, &attr->attr); bus_put(bus); } } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); struct bus_type *bus = priv->bus; kfree(priv); bus->p = NULL; } static struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; static struct kset *bus_kset; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. */ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (!err) { /* success */ err = count; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); static ssize_t drivers_autoprobe_show(struct bus_type *bus, char *buf) { return sysfs_emit(buf, "%d\n", bus->p->drivers_autoprobe); } static ssize_t drivers_autoprobe_store(struct bus_type *bus, const char *buf, size_t count) { if (buf[0] == '0') bus->p->drivers_autoprobe = 0; else bus->p->drivers_autoprobe = 1; return count; } static ssize_t drivers_probe_store(struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *, void *)) { struct klist_iter i; struct device *dev; int error = 0; if (!bus || !bus->p) return -EINVAL; klist_iter_init_node(&bus->p->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *bus_find_device(struct bus_type *bus, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)) { struct klist_iter i; struct device *dev; if (!bus || !bus->p) return NULL; klist_iter_init_node(&bus->p->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; klist_iter_exit(&i); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); /** * subsys_find_device_by_id - find a device with a specific enumeration number * @subsys: subsystem * @id: index 'id' in struct device * @hint: device to check first * * Check the hint's next object and if it is a match return it directly, * otherwise, fall back to a full list search. Either way a reference for * the returned object is taken. */ struct device *subsys_find_device_by_id(struct bus_type *subsys, unsigned int id, struct device *hint) { struct klist_iter i; struct device *dev; if (!subsys) return NULL; if (hint) { klist_iter_init_node(&subsys->p->klist_devices, &i, &hint->p->knode_bus); dev = next_device(&i); if (dev && dev->id == id && get_device(dev)) { klist_iter_exit(&i); return dev; } klist_iter_exit(&i); } klist_iter_init_node(&subsys->p->klist_devices, &i, NULL); while ((dev = next_device(&i))) { if (dev->id == id && get_device(dev)) { klist_iter_exit(&i); return dev; } } klist_iter_exit(&i); return NULL; } EXPORT_SYMBOL_GPL(subsys_find_device_by_id); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct klist_iter i; struct device_driver *drv; int error = 0; if (!bus) return -EINVAL; klist_iter_init_node(&bus->p->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. */ int bus_add_device(struct device *dev) { struct bus_type *bus = bus_get(dev->bus); int error = 0; if (bus) { pr_debug("bus: '%s': add device %s\n", bus->name, dev_name(dev)); error = device_add_groups(dev, bus->dev_groups); if (error) goto out_put; error = sysfs_create_link(&bus->p->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_groups; error = sysfs_create_link(&dev->kobj, &dev->bus->p->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices); } return 0; out_subsys: sysfs_remove_link(&bus->p->devices_kset->kobj, dev_name(dev)); out_groups: device_remove_groups(dev, bus->dev_groups); out_put: bus_put(dev->bus); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. */ void bus_probe_device(struct device *dev) { struct bus_type *bus = dev->bus; struct subsys_interface *sif; if (!bus) return; if (bus->p->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&bus->p->mutex); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). */ void bus_remove_device(struct device *dev) { struct bus_type *bus = dev->bus; struct subsys_interface *sif; if (!bus) return; mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&bus->p->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&dev->bus->p->devices_kset->kobj, dev_name(dev)); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); bus_put(dev->bus); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. */ int bus_add_driver(struct device_driver *drv) { struct bus_type *bus; struct driver_private *priv; int error = 0; bus = bus_get(drv->bus); if (!bus) return -EINVAL; pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = bus->p->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); if (drv->bus->p->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_del_list; } module_add_driver(drv->owner, drv); error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_add_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_del_list: klist_del(&priv->knode_bus); out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: bus_put(bus); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { if (!drv->bus) return; if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, drv->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); bus_put(drv->bus); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan it against existing drivers to see if it matches * any by calling device_attach() for the unbound devices. */ int bus_rescan_devices(struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to use if probing criteria changed during a devices lifetime and * driver attachment should change accordingly. */ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); static int bus_add_groups(struct bus_type *bus, const struct attribute_group **groups) { return sysfs_create_groups(&bus->p->subsys.kobj, groups); } static void bus_remove_groups(struct bus_type *bus, const struct attribute_group **groups) { sysfs_remove_groups(&bus->p->subsys.kobj, groups); } static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(struct bus_type *bus, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&bus->p->subsys.kobj, buf, count); return rc ? rc : count; } /* * "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */ int bus_register(struct bus_type *bus) { int retval; struct subsys_private *priv; struct lock_class_key *key = &bus->lock_key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->bus = bus; bus->p = priv; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name); if (retval) goto out; priv->subsys.kobj.kset = bus_kset; priv->subsys.kobj.ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, &priv->subsys.kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, &priv->subsys.kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = bus_add_groups(bus, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(bus->p->drivers_kset); bus_drivers_fail: kset_unregister(bus->p->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&bus->p->subsys); /* Above kset_unregister() will kfree @bus->p */ bus->p = NULL; out: kfree(bus->p); bus->p = NULL; return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. * Finally, we call bus_put() to release the refcount */ void bus_unregister(struct bus_type *bus) { pr_debug("bus: '%s': unregistering\n", bus->name); if (bus->dev_root) device_unregister(bus->dev_root); bus_remove_groups(bus, bus->bus_groups); remove_probe_files(bus); kset_unregister(bus->p->drivers_kset); kset_unregister(bus->p->devices_kset); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(&bus->p->subsys); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(struct bus_type *bus, struct notifier_block *nb) { return blocking_notifier_chain_register(&bus->p->bus_notifier, nb); } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(struct bus_type *bus, struct notifier_block *nb) { return blocking_notifier_chain_unregister(&bus->p->bus_notifier, nb); } EXPORT_SYMBOL_GPL(bus_unregister_notifier); struct kset *bus_get_kset(struct bus_type *bus) { return &bus->p->subsys; } EXPORT_SYMBOL_GPL(bus_get_kset); struct klist *bus_get_device_klist(struct bus_type *bus) { return &bus->p->klist_devices; } EXPORT_SYMBOL_GPL(bus_get_device_klist); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; device_klist = bus_get_device_klist(bus); spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @subsys: the subsys we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct bus_type *subsys, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&subsys->p->klist_devices, &iter->ki, start_knode); iter->type = type; } EXPORT_SYMBOL_GPL(subsys_dev_iter_init); /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } EXPORT_SYMBOL_GPL(subsys_dev_iter_next); /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } EXPORT_SYMBOL_GPL(subsys_dev_iter_exit); int subsys_interface_register(struct subsys_interface *sif) { struct bus_type *subsys; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; subsys = bus_get(sif->subsys); if (!subsys) return -EINVAL; mutex_lock(&subsys->p->mutex); list_add_tail(&sif->node, &subsys->p->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, subsys, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&subsys->p->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct bus_type *subsys; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; subsys = sif->subsys; mutex_lock(&subsys->p->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, subsys, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&subsys->p->mutex); bus_put(subsys); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; subsys->dev_root = dev; return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only. New subsystems should use plain subsystems; and * add the subsystem-wide attributes should be added to the subsystem * directory itself and not some create fake root-device placed in * /sys/devices/system/<name>. */ int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/system/<name> root device * with the name of the subystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need sysfs interface. */ int subsys_virtual_register(struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(NULL); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) return -ENOMEM; return 0; }
27 27 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 debugfs for wireless PHYs * * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2018 - 2019, 2021 Intel Corporation */ #include <linux/debugfs.h> #include <linux/rtnetlink.h> #include <linux/vmalloc.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "debugfs.h" #define DEBUGFS_FORMAT_BUFFER_SIZE 100 int mac80211_format_buffer(char __user *userbuf, size_t count, loff_t *ppos, char *fmt, ...) { va_list args; char buf[DEBUGFS_FORMAT_BUFFER_SIZE]; int res; va_start(args, fmt); res = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } #define DEBUGFS_READONLY_FILE_FN(name, fmt, value...) \ static ssize_t name## _read(struct file *file, char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct ieee80211_local *local = file->private_data; \ \ return mac80211_format_buffer(userbuf, count, ppos, \ fmt "\n", ##value); \ } #define DEBUGFS_READONLY_FILE_OPS(name) \ static const struct file_operations name## _ops = { \ .read = name## _read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ }; #define DEBUGFS_READONLY_FILE(name, fmt, value...) \ DEBUGFS_READONLY_FILE_FN(name, fmt, value) \ DEBUGFS_READONLY_FILE_OPS(name) #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, phyd, local, &name## _ops) #define DEBUGFS_ADD_MODE(name, mode) \ debugfs_create_file(#name, mode, phyd, local, &name## _ops); DEBUGFS_READONLY_FILE(hw_conf, "%x", local->hw.conf.flags); DEBUGFS_READONLY_FILE(user_power, "%d", local->user_power_level); DEBUGFS_READONLY_FILE(power, "%d", local->hw.conf.power_level); DEBUGFS_READONLY_FILE(total_ps_buffered, "%d", local->total_ps_buffered); DEBUGFS_READONLY_FILE(wep_iv, "%#08x", local->wep_iv & 0xffffff); DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); static ssize_t aqm_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; struct fq *fq = &local->fq; char buf[200]; int len = 0; spin_lock_bh(&local->fq.lock); rcu_read_lock(); len = scnprintf(buf, sizeof(buf), "access name value\n" "R fq_flows_cnt %u\n" "R fq_backlog %u\n" "R fq_overlimit %u\n" "R fq_overmemory %u\n" "R fq_collisions %u\n" "R fq_memory_usage %u\n" "RW fq_memory_limit %u\n" "RW fq_limit %u\n" "RW fq_quantum %u\n", fq->flows_cnt, fq->backlog, fq->overmemory, fq->overlimit, fq->collisions, fq->memory_usage, fq->memory_limit, fq->limit, fq->quantum); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aqm_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[100]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; else if (sscanf(buf, "fq_memory_limit %u", &local->fq.memory_limit) == 1) return count; else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1) return count; return -EINVAL; } static const struct file_operations aqm_ops = { .write = aqm_write, .read = aqm_read, .open = simple_open, .llseek = default_llseek, }; static ssize_t airtime_flags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[128] = {}, *pos, *end; pos = buf; end = pos + sizeof(buf) - 1; if (local->airtime_flags & AIRTIME_USE_TX) pos += scnprintf(pos, end - pos, "AIRTIME_TX\t(%lx)\n", AIRTIME_USE_TX); if (local->airtime_flags & AIRTIME_USE_RX) pos += scnprintf(pos, end - pos, "AIRTIME_RX\t(%lx)\n", AIRTIME_USE_RX); return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } static ssize_t airtime_flags_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[16]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; return count; } static const struct file_operations airtime_flags_ops = { .write = airtime_flags_write, .read = airtime_flags_read, .open = simple_open, .llseek = default_llseek, }; static ssize_t aql_txq_limit_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[400]; int len = 0; len = scnprintf(buf, sizeof(buf), "AC AQL limit low AQL limit high\n" "VO %u %u\n" "VI %u %u\n" "BE %u %u\n" "BK %u %u\n", local->airtime[IEEE80211_AC_VO].aql_txq_limit_low, local->airtime[IEEE80211_AC_VO].aql_txq_limit_high, local->airtime[IEEE80211_AC_VI].aql_txq_limit_low, local->airtime[IEEE80211_AC_VI].aql_txq_limit_high, local->airtime[IEEE80211_AC_BE].aql_txq_limit_low, local->airtime[IEEE80211_AC_BE].aql_txq_limit_high, local->airtime[IEEE80211_AC_BK].aql_txq_limit_low, local->airtime[IEEE80211_AC_BK].aql_txq_limit_high); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aql_txq_limit_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[100]; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; if (ac >= IEEE80211_NUM_ACS) return -EINVAL; q_limit_low_old = local->airtime[ac].aql_txq_limit_low; q_limit_high_old = local->airtime[ac].aql_txq_limit_high; local->airtime[ac].aql_txq_limit_low = q_limit_low; local->airtime[ac].aql_txq_limit_high = q_limit_high; mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { /* If a sta has customized queue limits, keep it */ if (sta->airtime[ac].aql_limit_low == q_limit_low_old && sta->airtime[ac].aql_limit_high == q_limit_high_old) { sta->airtime[ac].aql_limit_low = q_limit_low; sta->airtime[ac].aql_limit_high = q_limit_high; } } mutex_unlock(&local->sta_mtx); return count; } static const struct file_operations aql_txq_limit_ops = { .write = aql_txq_limit_write, .read = aql_txq_limit_read, .open = simple_open, .llseek = default_llseek, }; static ssize_t aql_enable_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; int len; len = scnprintf(buf, sizeof(buf), "%d\n", !static_key_false(&aql_disable.key)); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aql_enable_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool aql_disabled = static_key_false(&aql_disable.key); char buf[3]; size_t len; if (count > sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; buf[sizeof(buf) - 1] = '\0'; len = strlen(buf); if (len > 0 && buf[len - 1] == '\n') buf[len - 1] = 0; if (buf[0] == '0' && buf[1] == '\0') { if (!aql_disabled) static_branch_inc(&aql_disable); } else if (buf[0] == '1' && buf[1] == '\0') { if (aql_disabled) static_branch_dec(&aql_disable); } else { return -EINVAL; } return count; } static const struct file_operations aql_enable_ops = { .write = aql_enable_write, .read = aql_enable_read, .open = simple_open, .llseek = default_llseek, }; static ssize_t force_tx_status_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[3]; int len = 0; len = scnprintf(buf, sizeof(buf), "%d\n", (int)local->force_tx_status); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t force_tx_status_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[3]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; else if (buf[0] == '1' && buf[1] == '\0') local->force_tx_status = 1; else return -EINVAL; return count; } static const struct file_operations force_tx_status_ops = { .write = force_tx_status_write, .read = force_tx_status_read, .open = simple_open, .llseek = default_llseek, }; static ssize_t airtime_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[200]; u64 v_t[IEEE80211_NUM_ACS]; u64 wt[IEEE80211_NUM_ACS]; int len = 0, ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->airtime[ac].lock); v_t[ac] = local->airtime[ac].v_t; wt[ac] = local->airtime[ac].weight_sum; spin_unlock_bh(&local->airtime[ac].lock); } len = scnprintf(buf, sizeof(buf), "\tVO VI BE BK\n" "Virt-t\t%-10llu %-10llu %-10llu %-10llu\n" "Weight\t%-10llu %-10llu %-10llu %-10llu\n", v_t[0], v_t[1], v_t[2], v_t[3], wt[0], wt[1], wt[2], wt[3]); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static const struct file_operations airtime_ops = { .read = airtime_read, .open = simple_open, .llseek = default_llseek, }; #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; int ret; rtnl_lock(); wiphy_lock(local->hw.wiphy); __ieee80211_suspend(&local->hw, NULL); ret = __ieee80211_resume(&local->hw); wiphy_unlock(local->hw.wiphy); if (ret) cfg80211_shutdown_all_interfaces(local->hw.wiphy); rtnl_unlock(); return count; } static const struct file_operations reset_ops = { .write = reset_write, .open = simple_open, .llseek = noop_llseek, }; #endif static const char *hw_flag_names[] = { #define FLAG(F) [IEEE80211_HW_##F] = #F FLAG(HAS_RATE_CONTROL), FLAG(RX_INCLUDES_FCS), FLAG(HOST_BROADCAST_PS_BUFFERING), FLAG(SIGNAL_UNSPEC), FLAG(SIGNAL_DBM), FLAG(NEED_DTIM_BEFORE_ASSOC), FLAG(SPECTRUM_MGMT), FLAG(AMPDU_AGGREGATION), FLAG(SUPPORTS_PS), FLAG(PS_NULLFUNC_STACK), FLAG(SUPPORTS_DYNAMIC_PS), FLAG(MFP_CAPABLE), FLAG(WANT_MONITOR_VIF), FLAG(NO_AUTO_VIF), FLAG(SW_CRYPTO_CONTROL), FLAG(SUPPORT_FAST_XMIT), FLAG(REPORTS_TX_ACK_STATUS), FLAG(CONNECTION_MONITOR), FLAG(QUEUE_CONTROL), FLAG(SUPPORTS_PER_STA_GTK), FLAG(AP_LINK_PS), FLAG(TX_AMPDU_SETUP_IN_HW), FLAG(SUPPORTS_RC_TABLE), FLAG(P2P_DEV_ADDR_FOR_INTF), FLAG(TIMING_BEACON_ONLY), FLAG(SUPPORTS_HT_CCK_RATES), FLAG(CHANCTX_STA_CSA), FLAG(SUPPORTS_CLONED_SKBS), FLAG(SINGLE_SCAN_ON_ALL_BANDS), FLAG(TDLS_WIDER_BW), FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), FLAG(SUPPORTS_REORDERING_BUFFER), FLAG(USES_RSS), FLAG(TX_AMSDU), FLAG(TX_FRAG_LIST), FLAG(REPORTS_LOW_ACK), FLAG(SUPPORTS_TX_FRAG), FLAG(SUPPORTS_TDLS_BUFFER_STA), FLAG(DEAUTH_NEED_MGD_TX_PREP), FLAG(DOESNT_SUPPORT_QOS_NDP), FLAG(BUFF_MMPDU_TXQ), FLAG(SUPPORTS_VHT_EXT_NSS_BW), FLAG(STA_MMPDU_TXQ), FLAG(TX_STATUS_NO_AMPDU_LEN), FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(AMPDU_KEYBORDER_SUPPORT), FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), FLAG(SUPPORTS_RX_DECAP_OFFLOAD), FLAG(SUPPORTS_CONC_MON_RX_DECAP), #undef FLAG }; static ssize_t hwflags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; size_t bufsz = 30 * NUM_IEEE80211_HW_FLAGS; char *buf = kzalloc(bufsz, GFP_KERNEL); char *pos = buf, *end = buf + bufsz - 1; ssize_t rv; int i; if (!buf) return -ENOMEM; /* fail compilation if somebody adds or removes * a flag without updating the name array above */ BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS); for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) pos += scnprintf(pos, end - pos, "%s\n", hw_flag_names[i]); } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); return rv; } static ssize_t misc_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */ size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9; char *buf; char *pos, *end; ssize_t rv; int i; int ln; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; pos = buf; end = buf + bufsz - 1; pos += scnprintf(pos, end - pos, "pending:\n"); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { ln = skb_queue_len(&local->pending[i]); pos += scnprintf(pos, end - pos, "[%i] %d\n", i, ln); } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); return rv; } static ssize_t queues_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; unsigned long flags; char buf[IEEE80211_MAX_QUEUES * 20]; int q, res = 0; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (q = 0; q < local->hw.queues; q++) res += sprintf(buf + res, "%02d: %#.8lx/%d\n", q, local->queue_stop_reasons[q], skb_queue_len(&local->pending[q])); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return simple_read_from_buffer(user_buf, count, ppos, buf, res); } DEBUGFS_READONLY_FILE_OPS(hwflags); DEBUGFS_READONLY_FILE_OPS(queues); DEBUGFS_READONLY_FILE_OPS(misc); /* statistics stuff */ static ssize_t format_devstat_counter(struct ieee80211_local *local, char __user *userbuf, size_t count, loff_t *ppos, int (*printvalue)(struct ieee80211_low_level_stats *stats, char *buf, int buflen)) { struct ieee80211_low_level_stats stats; char buf[20]; int res; rtnl_lock(); res = drv_get_stats(local, &stats); rtnl_unlock(); if (res) return res; res = printvalue(&stats, buf, sizeof(buf)); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } #define DEBUGFS_DEVSTATS_FILE(name) \ static int print_devstats_##name(struct ieee80211_low_level_stats *stats,\ char *buf, int buflen) \ { \ return scnprintf(buf, buflen, "%u\n", stats->name); \ } \ static ssize_t stats_ ##name## _read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ return format_devstat_counter(file->private_data, \ userbuf, \ count, \ ppos, \ print_devstats_##name); \ } \ \ static const struct file_operations stats_ ##name## _ops = { \ .read = stats_ ##name## _read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ }; #define DEBUGFS_STATS_ADD(name) \ debugfs_create_u32(#name, 0400, statsd, &local->name); #define DEBUGFS_DEVSTATS_ADD(name) \ debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops); DEBUGFS_DEVSTATS_FILE(dot11ACKFailureCount); DEBUGFS_DEVSTATS_FILE(dot11RTSFailureCount); DEBUGFS_DEVSTATS_FILE(dot11FCSErrorCount); DEBUGFS_DEVSTATS_FILE(dot11RTSSuccessCount); void debugfs_hw_add(struct ieee80211_local *local) { struct dentry *phyd = local->hw.wiphy->debugfsdir; struct dentry *statsd; if (!phyd) return; local->debugfs.keys = debugfs_create_dir("keys", phyd); DEBUGFS_ADD(total_ps_buffered); DEBUGFS_ADD(wep_iv); DEBUGFS_ADD(rate_ctrl_alg); DEBUGFS_ADD(queues); DEBUGFS_ADD(misc); #ifdef CONFIG_PM DEBUGFS_ADD_MODE(reset, 0200); #endif DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); DEBUGFS_ADD(hw_conf); DEBUGFS_ADD_MODE(force_tx_status, 0600); DEBUGFS_ADD_MODE(aql_enable, 0600); if (local->ops->wake_tx_queue) DEBUGFS_ADD_MODE(aqm, 0600); if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) { DEBUGFS_ADD_MODE(airtime, 0600); DEBUGFS_ADD_MODE(airtime_flags, 0600); } DEBUGFS_ADD(aql_txq_limit); debugfs_create_u32("aql_threshold", 0600, phyd, &local->aql_threshold); statsd = debugfs_create_dir("statistics", phyd); /* if the dir failed, don't put all the other things into the root! */ if (!statsd) return; #ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount); DEBUGFS_STATS_ADD(dot11FailedCount); DEBUGFS_STATS_ADD(dot11RetryCount); DEBUGFS_STATS_ADD(dot11MultipleRetryCount); DEBUGFS_STATS_ADD(dot11FrameDuplicateCount); DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); DEBUGFS_STATS_ADD(tx_handlers_drop); DEBUGFS_STATS_ADD(tx_handlers_queued); DEBUGFS_STATS_ADD(tx_handlers_drop_wep); DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port); DEBUGFS_STATS_ADD(rx_handlers_drop); DEBUGFS_STATS_ADD(rx_handlers_queued); DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc); DEBUGFS_STATS_ADD(rx_handlers_drop_defrag); DEBUGFS_STATS_ADD(tx_expand_skb_head); DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned); DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag); DEBUGFS_STATS_ADD(rx_handlers_fragments); DEBUGFS_STATS_ADD(tx_status_drop); #endif DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount); DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount); DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount); DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount); }
14 13 24 19 5 13 18 19 19 19 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "noise.h" #include "device.h" #include "peer.h" #include "messages.h" #include "queueing.h" #include "peerlookup.h" #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/bitmap.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <crypto/algapi.h> /* This implements Noise_IKpsk2: * * <- s * ****** * -> e, es, s, ss, {t} * <- e, ee, se, psk, {} */ static const u8 handshake_name[37] = "Noise_IKpsk2_25519_ChaChaPoly_BLAKE2s"; static const u8 identifier_name[34] = "WireGuard v1 zx2c4 Jason@zx2c4.com"; static u8 handshake_init_hash[NOISE_HASH_LEN] __ro_after_init; static u8 handshake_init_chaining_key[NOISE_HASH_LEN] __ro_after_init; static atomic64_t keypair_counter = ATOMIC64_INIT(0); void __init wg_noise_init(void) { struct blake2s_state blake; blake2s(handshake_init_chaining_key, handshake_name, NULL, NOISE_HASH_LEN, sizeof(handshake_name), 0); blake2s_init(&blake, NOISE_HASH_LEN); blake2s_update(&blake, handshake_init_chaining_key, NOISE_HASH_LEN); blake2s_update(&blake, identifier_name, sizeof(identifier_name)); blake2s_final(&blake, handshake_init_hash); } /* Must hold peer->handshake.static_identity->lock */ void wg_noise_precompute_static_static(struct wg_peer *peer) { down_write(&peer->handshake.lock); if (!peer->handshake.static_identity->has_identity || !curve25519(peer->handshake.precomputed_static_static, peer->handshake.static_identity->static_private, peer->handshake.remote_static)) memset(peer->handshake.precomputed_static_static, 0, NOISE_PUBLIC_KEY_LEN); up_write(&peer->handshake.lock); } void wg_noise_handshake_init(struct noise_handshake *handshake, struct noise_static_identity *static_identity, const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN], const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN], struct wg_peer *peer) { memset(handshake, 0, sizeof(*handshake)); init_rwsem(&handshake->lock); handshake->entry.type = INDEX_HASHTABLE_HANDSHAKE; handshake->entry.peer = peer; memcpy(handshake->remote_static, peer_public_key, NOISE_PUBLIC_KEY_LEN); if (peer_preshared_key) memcpy(handshake->preshared_key, peer_preshared_key, NOISE_SYMMETRIC_KEY_LEN); handshake->static_identity = static_identity; handshake->state = HANDSHAKE_ZEROED; wg_noise_precompute_static_static(peer); } static void handshake_zero(struct noise_handshake *handshake) { memset(&handshake->ephemeral_private, 0, NOISE_PUBLIC_KEY_LEN); memset(&handshake->remote_ephemeral, 0, NOISE_PUBLIC_KEY_LEN); memset(&handshake->hash, 0, NOISE_HASH_LEN); memset(&handshake->chaining_key, 0, NOISE_HASH_LEN); handshake->remote_index = 0; handshake->state = HANDSHAKE_ZEROED; } void wg_noise_handshake_clear(struct noise_handshake *handshake) { down_write(&handshake->lock); wg_index_hashtable_remove( handshake->entry.peer->device->index_hashtable, &handshake->entry); handshake_zero(handshake); up_write(&handshake->lock); } static struct noise_keypair *keypair_create(struct wg_peer *peer) { struct noise_keypair *keypair = kzalloc(sizeof(*keypair), GFP_KERNEL); if (unlikely(!keypair)) return NULL; spin_lock_init(&keypair->receiving_counter.lock); keypair->internal_id = atomic64_inc_return(&keypair_counter); keypair->entry.type = INDEX_HASHTABLE_KEYPAIR; keypair->entry.peer = peer; kref_init(&keypair->refcount); return keypair; } static void keypair_free_rcu(struct rcu_head *rcu) { kfree_sensitive(container_of(rcu, struct noise_keypair, rcu)); } static void keypair_free_kref(struct kref *kref) { struct noise_keypair *keypair = container_of(kref, struct noise_keypair, refcount); net_dbg_ratelimited("%s: Keypair %llu destroyed for peer %llu\n", keypair->entry.peer->device->dev->name, keypair->internal_id, keypair->entry.peer->internal_id); wg_index_hashtable_remove(keypair->entry.peer->device->index_hashtable, &keypair->entry); call_rcu(&keypair->rcu, keypair_free_rcu); } void wg_noise_keypair_put(struct noise_keypair *keypair, bool unreference_now) { if (unlikely(!keypair)) return; if (unlikely(unreference_now)) wg_index_hashtable_remove( keypair->entry.peer->device->index_hashtable, &keypair->entry); kref_put(&keypair->refcount, keypair_free_kref); } struct noise_keypair *wg_noise_keypair_get(struct noise_keypair *keypair) { RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(), "Taking noise keypair reference without holding the RCU BH read lock"); if (unlikely(!keypair || !kref_get_unless_zero(&keypair->refcount))) return NULL; return keypair; } void wg_noise_keypairs_clear(struct noise_keypairs *keypairs) { struct noise_keypair *old; spin_lock_bh(&keypairs->keypair_update_lock); /* We zero the next_keypair before zeroing the others, so that * wg_noise_received_with_keypair returns early before subsequent ones * are zeroed. */ old = rcu_dereference_protected(keypairs->next_keypair, lockdep_is_held(&keypairs->keypair_update_lock)); RCU_INIT_POINTER(keypairs->next_keypair, NULL); wg_noise_keypair_put(old, true); old = rcu_dereference_protected(keypairs->previous_keypair, lockdep_is_held(&keypairs->keypair_update_lock)); RCU_INIT_POINTER(keypairs->previous_keypair, NULL); wg_noise_keypair_put(old, true); old = rcu_dereference_protected(keypairs->current_keypair, lockdep_is_held(&keypairs->keypair_update_lock)); RCU_INIT_POINTER(keypairs->current_keypair, NULL); wg_noise_keypair_put(old, true); spin_unlock_bh(&keypairs->keypair_update_lock); } void wg_noise_expire_current_peer_keypairs(struct wg_peer *peer) { struct noise_keypair *keypair; wg_noise_handshake_clear(&peer->handshake); wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); spin_lock_bh(&peer->keypairs.keypair_update_lock); keypair = rcu_dereference_protected(peer->keypairs.next_keypair, lockdep_is_held(&peer->keypairs.keypair_update_lock)); if (keypair) keypair->sending.is_valid = false; keypair = rcu_dereference_protected(peer->keypairs.current_keypair, lockdep_is_held(&peer->keypairs.keypair_update_lock)); if (keypair) keypair->sending.is_valid = false; spin_unlock_bh(&peer->keypairs.keypair_update_lock); } static void add_new_keypair(struct noise_keypairs *keypairs, struct noise_keypair *new_keypair) { struct noise_keypair *previous_keypair, *next_keypair, *current_keypair; spin_lock_bh(&keypairs->keypair_update_lock); previous_keypair = rcu_dereference_protected(keypairs->previous_keypair, lockdep_is_held(&keypairs->keypair_update_lock)); next_keypair = rcu_dereference_protected(keypairs->next_keypair, lockdep_is_held(&keypairs->keypair_update_lock)); current_keypair = rcu_dereference_protected(keypairs->current_keypair, lockdep_is_held(&keypairs->keypair_update_lock)); if (new_keypair->i_am_the_initiator) { /* If we're the initiator, it means we've sent a handshake, and * received a confirmation response, which means this new * keypair can now be used. */ if (next_keypair) { /* If there already was a next keypair pending, we * demote it to be the previous keypair, and free the * existing current. Note that this means KCI can result * in this transition. It would perhaps be more sound to * always just get rid of the unused next keypair * instead of putting it in the previous slot, but this * might be a bit less robust. Something to think about * for the future. */ RCU_INIT_POINTER(keypairs->next_keypair, NULL); rcu_assign_pointer(keypairs->previous_keypair, next_keypair); wg_noise_keypair_put(current_keypair, true); } else /* If there wasn't an existing next keypair, we replace * the previous with the current one. */ rcu_assign_pointer(keypairs->previous_keypair, current_keypair); /* At this point we can get rid of the old previous keypair, and * set up the new keypair. */ wg_noise_keypair_put(previous_keypair, true); rcu_assign_pointer(keypairs->current_keypair, new_keypair); } else { /* If we're the responder, it means we can't use the new keypair * until we receive confirmation via the first data packet, so * we get rid of the existing previous one, the possibly * existing next one, and slide in the new next one. */ rcu_assign_pointer(keypairs->next_keypair, new_keypair); wg_noise_keypair_put(next_keypair, true); RCU_INIT_POINTER(keypairs->previous_keypair, NULL); wg_noise_keypair_put(previous_keypair, true); } spin_unlock_bh(&keypairs->keypair_update_lock); } bool wg_noise_received_with_keypair(struct noise_keypairs *keypairs, struct noise_keypair *received_keypair) { struct noise_keypair *old_keypair; bool key_is_new; /* We first check without taking the spinlock. */ key_is_new = received_keypair == rcu_access_pointer(keypairs->next_keypair); if (likely(!key_is_new)) return false; spin_lock_bh(&keypairs->keypair_update_lock); /* After locking, we double check that things didn't change from * beneath us. */ if (unlikely(received_keypair != rcu_dereference_protected(keypairs->next_keypair, lockdep_is_held(&keypairs->keypair_update_lock)))) { spin_unlock_bh(&keypairs->keypair_update_lock); return false; } /* When we've finally received the confirmation, we slide the next * into the current, the current into the previous, and get rid of * the old previous. */ old_keypair = rcu_dereference_protected(keypairs->previous_keypair, lockdep_is_held(&keypairs->keypair_update_lock)); rcu_assign_pointer(keypairs->previous_keypair, rcu_dereference_protected(keypairs->current_keypair, lockdep_is_held(&keypairs->keypair_update_lock))); wg_noise_keypair_put(old_keypair, true); rcu_assign_pointer(keypairs->current_keypair, received_keypair); RCU_INIT_POINTER(keypairs->next_keypair, NULL); spin_unlock_bh(&keypairs->keypair_update_lock); return true; } /* Must hold static_identity->lock */ void wg_noise_set_static_identity_private_key( struct noise_static_identity *static_identity, const u8 private_key[NOISE_PUBLIC_KEY_LEN]) { memcpy(static_identity->static_private, private_key, NOISE_PUBLIC_KEY_LEN); curve25519_clamp_secret(static_identity->static_private); static_identity->has_identity = curve25519_generate_public( static_identity->static_public, private_key); } static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen) { struct blake2s_state state; u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 }; u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32)); int i; if (keylen > BLAKE2S_BLOCK_SIZE) { blake2s_init(&state, BLAKE2S_HASH_SIZE); blake2s_update(&state, key, keylen); blake2s_final(&state, x_key); } else memcpy(x_key, key, keylen); for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) x_key[i] ^= 0x36; blake2s_init(&state, BLAKE2S_HASH_SIZE); blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); blake2s_update(&state, in, inlen); blake2s_final(&state, i_hash); for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) x_key[i] ^= 0x5c ^ 0x36; blake2s_init(&state, BLAKE2S_HASH_SIZE); blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE); blake2s_final(&state, i_hash); memcpy(out, i_hash, BLAKE2S_HASH_SIZE); memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE); memzero_explicit(i_hash, BLAKE2S_HASH_SIZE); } /* This is Hugo Krawczyk's HKDF: * - https://eprint.iacr.org/2010/264.pdf * - https://tools.ietf.org/html/rfc5869 */ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data, size_t first_len, size_t second_len, size_t third_len, size_t data_len, const u8 chaining_key[NOISE_HASH_LEN]) { u8 output[BLAKE2S_HASH_SIZE + 1]; u8 secret[BLAKE2S_HASH_SIZE]; WARN_ON(IS_ENABLED(DEBUG) && (first_len > BLAKE2S_HASH_SIZE || second_len > BLAKE2S_HASH_SIZE || third_len > BLAKE2S_HASH_SIZE || ((second_len || second_dst || third_len || third_dst) && (!first_len || !first_dst)) || ((third_len || third_dst) && (!second_len || !second_dst)))); /* Extract entropy from data into secret */ hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN); if (!first_dst || !first_len) goto out; /* Expand first key: key = secret, data = 0x1 */ output[0] = 1; hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE); memcpy(first_dst, output, first_len); if (!second_dst || !second_len) goto out; /* Expand second key: key = secret, data = first-key || 0x2 */ output[BLAKE2S_HASH_SIZE] = 2; hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE); memcpy(second_dst, output, second_len); if (!third_dst || !third_len) goto out; /* Expand third key: key = secret, data = second-key || 0x3 */ output[BLAKE2S_HASH_SIZE] = 3; hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE); memcpy(third_dst, output, third_len); out: /* Clear sensitive data from stack */ memzero_explicit(secret, BLAKE2S_HASH_SIZE); memzero_explicit(output, BLAKE2S_HASH_SIZE + 1); } static void derive_keys(struct noise_symmetric_key *first_dst, struct noise_symmetric_key *second_dst, const u8 chaining_key[NOISE_HASH_LEN]) { u64 birthdate = ktime_get_coarse_boottime_ns(); kdf(first_dst->key, second_dst->key, NULL, NULL, NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, 0, chaining_key); first_dst->birthdate = second_dst->birthdate = birthdate; first_dst->is_valid = second_dst->is_valid = true; } static bool __must_check mix_dh(u8 chaining_key[NOISE_HASH_LEN], u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 private[NOISE_PUBLIC_KEY_LEN], const u8 public[NOISE_PUBLIC_KEY_LEN]) { u8 dh_calculation[NOISE_PUBLIC_KEY_LEN]; if (unlikely(!curve25519(dh_calculation, private, public))) return false; kdf(chaining_key, key, NULL, dh_calculation, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, chaining_key); memzero_explicit(dh_calculation, NOISE_PUBLIC_KEY_LEN); return true; } static bool __must_check mix_precomputed_dh(u8 chaining_key[NOISE_HASH_LEN], u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 precomputed[NOISE_PUBLIC_KEY_LEN]) { static u8 zero_point[NOISE_PUBLIC_KEY_LEN]; if (unlikely(!crypto_memneq(precomputed, zero_point, NOISE_PUBLIC_KEY_LEN))) return false; kdf(chaining_key, key, NULL, precomputed, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, chaining_key); return true; } static void mix_hash(u8 hash[NOISE_HASH_LEN], const u8 *src, size_t src_len) { struct blake2s_state blake; blake2s_init(&blake, NOISE_HASH_LEN); blake2s_update(&blake, hash, NOISE_HASH_LEN); blake2s_update(&blake, src, src_len); blake2s_final(&blake, hash); } static void mix_psk(u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN], u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 psk[NOISE_SYMMETRIC_KEY_LEN]) { u8 temp_hash[NOISE_HASH_LEN]; kdf(chaining_key, temp_hash, key, psk, NOISE_HASH_LEN, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, chaining_key); mix_hash(hash, temp_hash, NOISE_HASH_LEN); memzero_explicit(temp_hash, NOISE_HASH_LEN); } static void handshake_init(u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN], const u8 remote_static[NOISE_PUBLIC_KEY_LEN]) { memcpy(hash, handshake_init_hash, NOISE_HASH_LEN); memcpy(chaining_key, handshake_init_chaining_key, NOISE_HASH_LEN); mix_hash(hash, remote_static, NOISE_PUBLIC_KEY_LEN); } static void message_encrypt(u8 *dst_ciphertext, const u8 *src_plaintext, size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 hash[NOISE_HASH_LEN]) { chacha20poly1305_encrypt(dst_ciphertext, src_plaintext, src_len, hash, NOISE_HASH_LEN, 0 /* Always zero for Noise_IK */, key); mix_hash(hash, dst_ciphertext, noise_encrypted_len(src_len)); } static bool message_decrypt(u8 *dst_plaintext, const u8 *src_ciphertext, size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 hash[NOISE_HASH_LEN]) { if (!chacha20poly1305_decrypt(dst_plaintext, src_ciphertext, src_len, hash, NOISE_HASH_LEN, 0 /* Always zero for Noise_IK */, key)) return false; mix_hash(hash, src_ciphertext, src_len); return true; } static void message_ephemeral(u8 ephemeral_dst[NOISE_PUBLIC_KEY_LEN], const u8 ephemeral_src[NOISE_PUBLIC_KEY_LEN], u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN]) { if (ephemeral_dst != ephemeral_src) memcpy(ephemeral_dst, ephemeral_src, NOISE_PUBLIC_KEY_LEN); mix_hash(hash, ephemeral_src, NOISE_PUBLIC_KEY_LEN); kdf(chaining_key, NULL, NULL, ephemeral_src, NOISE_HASH_LEN, 0, 0, NOISE_PUBLIC_KEY_LEN, chaining_key); } static void tai64n_now(u8 output[NOISE_TIMESTAMP_LEN]) { struct timespec64 now; ktime_get_real_ts64(&now); /* In order to prevent some sort of infoleak from precise timers, we * round down the nanoseconds part to the closest rounded-down power of * two to the maximum initiations per second allowed anyway by the * implementation. */ now.tv_nsec = ALIGN_DOWN(now.tv_nsec, rounddown_pow_of_two(NSEC_PER_SEC / INITIATIONS_PER_SECOND)); /* https://cr.yp.to/libtai/tai64.html */ *(__be64 *)output = cpu_to_be64(0x400000000000000aULL + now.tv_sec); *(__be32 *)(output + sizeof(__be64)) = cpu_to_be32(now.tv_nsec); } bool wg_noise_handshake_create_initiation(struct message_handshake_initiation *dst, struct noise_handshake *handshake) { u8 timestamp[NOISE_TIMESTAMP_LEN]; u8 key[NOISE_SYMMETRIC_KEY_LEN]; bool ret = false; /* We need to wait for crng _before_ taking any locks, since * curve25519_generate_secret uses get_random_bytes_wait. */ wait_for_random_bytes(); down_read(&handshake->static_identity->lock); down_write(&handshake->lock); if (unlikely(!handshake->static_identity->has_identity)) goto out; dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION); handshake_init(handshake->chaining_key, handshake->hash, handshake->remote_static); /* e */ curve25519_generate_secret(handshake->ephemeral_private); if (!curve25519_generate_public(dst->unencrypted_ephemeral, handshake->ephemeral_private)) goto out; message_ephemeral(dst->unencrypted_ephemeral, dst->unencrypted_ephemeral, handshake->chaining_key, handshake->hash); /* es */ if (!mix_dh(handshake->chaining_key, key, handshake->ephemeral_private, handshake->remote_static)) goto out; /* s */ message_encrypt(dst->encrypted_static, handshake->static_identity->static_public, NOISE_PUBLIC_KEY_LEN, key, handshake->hash); /* ss */ if (!mix_precomputed_dh(handshake->chaining_key, key, handshake->precomputed_static_static)) goto out; /* {t} */ tai64n_now(timestamp); message_encrypt(dst->encrypted_timestamp, timestamp, NOISE_TIMESTAMP_LEN, key, handshake->hash); dst->sender_index = wg_index_hashtable_insert( handshake->entry.peer->device->index_hashtable, &handshake->entry); handshake->state = HANDSHAKE_CREATED_INITIATION; ret = true; out: up_write(&handshake->lock); up_read(&handshake->static_identity->lock); memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN); return ret; } struct wg_peer * wg_noise_handshake_consume_initiation(struct message_handshake_initiation *src, struct wg_device *wg) { struct wg_peer *peer = NULL, *ret_peer = NULL; struct noise_handshake *handshake; bool replay_attack, flood_attack; u8 key[NOISE_SYMMETRIC_KEY_LEN]; u8 chaining_key[NOISE_HASH_LEN]; u8 hash[NOISE_HASH_LEN]; u8 s[NOISE_PUBLIC_KEY_LEN]; u8 e[NOISE_PUBLIC_KEY_LEN]; u8 t[NOISE_TIMESTAMP_LEN]; u64 initiation_consumption; down_read(&wg->static_identity.lock); if (unlikely(!wg->static_identity.has_identity)) goto out; handshake_init(chaining_key, hash, wg->static_identity.static_public); /* e */ message_ephemeral(e, src->unencrypted_ephemeral, chaining_key, hash); /* es */ if (!mix_dh(chaining_key, key, wg->static_identity.static_private, e)) goto out; /* s */ if (!message_decrypt(s, src->encrypted_static, sizeof(src->encrypted_static), key, hash)) goto out; /* Lookup which peer we're actually talking to */ peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, s); if (!peer) goto out; handshake = &peer->handshake; /* ss */ if (!mix_precomputed_dh(chaining_key, key, handshake->precomputed_static_static)) goto out; /* {t} */ if (!message_decrypt(t, src->encrypted_timestamp, sizeof(src->encrypted_timestamp), key, hash)) goto out; down_read(&handshake->lock); replay_attack = memcmp(t, handshake->latest_timestamp, NOISE_TIMESTAMP_LEN) <= 0; flood_attack = (s64)handshake->last_initiation_consumption + NSEC_PER_SEC / INITIATIONS_PER_SECOND > (s64)ktime_get_coarse_boottime_ns(); up_read(&handshake->lock); if (replay_attack || flood_attack) goto out; /* Success! Copy everything to peer */ down_write(&handshake->lock); memcpy(handshake->remote_ephemeral, e, NOISE_PUBLIC_KEY_LEN); if (memcmp(t, handshake->latest_timestamp, NOISE_TIMESTAMP_LEN) > 0) memcpy(handshake->latest_timestamp, t, NOISE_TIMESTAMP_LEN); memcpy(handshake->hash, hash, NOISE_HASH_LEN); memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN); handshake->remote_index = src->sender_index; initiation_consumption = ktime_get_coarse_boottime_ns(); if ((s64)(handshake->last_initiation_consumption - initiation_consumption) < 0) handshake->last_initiation_consumption = initiation_consumption; handshake->state = HANDSHAKE_CONSUMED_INITIATION; up_write(&handshake->lock); ret_peer = peer; out: memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN); memzero_explicit(hash, NOISE_HASH_LEN); memzero_explicit(chaining_key, NOISE_HASH_LEN); up_read(&wg->static_identity.lock); if (!ret_peer) wg_peer_put(peer); return ret_peer; } bool wg_noise_handshake_create_response(struct message_handshake_response *dst, struct noise_handshake *handshake) { u8 key[NOISE_SYMMETRIC_KEY_LEN]; bool ret = false; /* We need to wait for crng _before_ taking any locks, since * curve25519_generate_secret uses get_random_bytes_wait. */ wait_for_random_bytes(); down_read(&handshake->static_identity->lock); down_write(&handshake->lock); if (handshake->state != HANDSHAKE_CONSUMED_INITIATION) goto out; dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE); dst->receiver_index = handshake->remote_index; /* e */ curve25519_generate_secret(handshake->ephemeral_private); if (!curve25519_generate_public(dst->unencrypted_ephemeral, handshake->ephemeral_private)) goto out; message_ephemeral(dst->unencrypted_ephemeral, dst->unencrypted_ephemeral, handshake->chaining_key, handshake->hash); /* ee */ if (!mix_dh(handshake->chaining_key, NULL, handshake->ephemeral_private, handshake->remote_ephemeral)) goto out; /* se */ if (!mix_dh(handshake->chaining_key, NULL, handshake->ephemeral_private, handshake->remote_static)) goto out; /* psk */ mix_psk(handshake->chaining_key, handshake->hash, key, handshake->preshared_key); /* {} */ message_encrypt(dst->encrypted_nothing, NULL, 0, key, handshake->hash); dst->sender_index = wg_index_hashtable_insert( handshake->entry.peer->device->index_hashtable, &handshake->entry); handshake->state = HANDSHAKE_CREATED_RESPONSE; ret = true; out: up_write(&handshake->lock); up_read(&handshake->static_identity->lock); memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN); return ret; } struct wg_peer * wg_noise_handshake_consume_response(struct message_handshake_response *src, struct wg_device *wg) { enum noise_handshake_state state = HANDSHAKE_ZEROED; struct wg_peer *peer = NULL, *ret_peer = NULL; struct noise_handshake *handshake; u8 key[NOISE_SYMMETRIC_KEY_LEN]; u8 hash[NOISE_HASH_LEN]; u8 chaining_key[NOISE_HASH_LEN]; u8 e[NOISE_PUBLIC_KEY_LEN]; u8 ephemeral_private[NOISE_PUBLIC_KEY_LEN]; u8 static_private[NOISE_PUBLIC_KEY_LEN]; u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]; down_read(&wg->static_identity.lock); if (unlikely(!wg->static_identity.has_identity)) goto out; handshake = (struct noise_handshake *)wg_index_hashtable_lookup( wg->index_hashtable, INDEX_HASHTABLE_HANDSHAKE, src->receiver_index, &peer); if (unlikely(!handshake)) goto out; down_read(&handshake->lock); state = handshake->state; memcpy(hash, handshake->hash, NOISE_HASH_LEN); memcpy(chaining_key, handshake->chaining_key, NOISE_HASH_LEN); memcpy(ephemeral_private, handshake->ephemeral_private, NOISE_PUBLIC_KEY_LEN); memcpy(preshared_key, handshake->preshared_key, NOISE_SYMMETRIC_KEY_LEN); up_read(&handshake->lock); if (state != HANDSHAKE_CREATED_INITIATION) goto fail; /* e */ message_ephemeral(e, src->unencrypted_ephemeral, chaining_key, hash); /* ee */ if (!mix_dh(chaining_key, NULL, ephemeral_private, e)) goto fail; /* se */ if (!mix_dh(chaining_key, NULL, wg->static_identity.static_private, e)) goto fail; /* psk */ mix_psk(chaining_key, hash, key, preshared_key); /* {} */ if (!message_decrypt(NULL, src->encrypted_nothing, sizeof(src->encrypted_nothing), key, hash)) goto fail; /* Success! Copy everything to peer */ down_write(&handshake->lock); /* It's important to check that the state is still the same, while we * have an exclusive lock. */ if (handshake->state != state) { up_write(&handshake->lock); goto fail; } memcpy(handshake->remote_ephemeral, e, NOISE_PUBLIC_KEY_LEN); memcpy(handshake->hash, hash, NOISE_HASH_LEN); memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN); handshake->remote_index = src->sender_index; handshake->state = HANDSHAKE_CONSUMED_RESPONSE; up_write(&handshake->lock); ret_peer = peer; goto out; fail: wg_peer_put(peer); out: memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN); memzero_explicit(hash, NOISE_HASH_LEN); memzero_explicit(chaining_key, NOISE_HASH_LEN); memzero_explicit(ephemeral_private, NOISE_PUBLIC_KEY_LEN); memzero_explicit(static_private, NOISE_PUBLIC_KEY_LEN); memzero_explicit(preshared_key, NOISE_SYMMETRIC_KEY_LEN); up_read(&wg->static_identity.lock); return ret_peer; } bool wg_noise_handshake_begin_session(struct noise_handshake *handshake, struct noise_keypairs *keypairs) { struct noise_keypair *new_keypair; bool ret = false; down_write(&handshake->lock); if (handshake->state != HANDSHAKE_CREATED_RESPONSE && handshake->state != HANDSHAKE_CONSUMED_RESPONSE) goto out; new_keypair = keypair_create(handshake->entry.peer); if (!new_keypair) goto out; new_keypair->i_am_the_initiator = handshake->state == HANDSHAKE_CONSUMED_RESPONSE; new_keypair->remote_index = handshake->remote_index; if (new_keypair->i_am_the_initiator) derive_keys(&new_keypair->sending, &new_keypair->receiving, handshake->chaining_key); else derive_keys(&new_keypair->receiving, &new_keypair->sending, handshake->chaining_key); handshake_zero(handshake); rcu_read_lock_bh(); if (likely(!READ_ONCE(container_of(handshake, struct wg_peer, handshake)->is_dead))) { add_new_keypair(keypairs, new_keypair); net_dbg_ratelimited("%s: Keypair %llu created for peer %llu\n", handshake->entry.peer->device->dev->name, new_keypair->internal_id, handshake->entry.peer->internal_id); ret = wg_index_hashtable_replace( handshake->entry.peer->device->index_hashtable, &handshake->entry, &new_keypair->entry); } else { kfree_sensitive(new_keypair); } rcu_read_unlock_bh(); out: up_write(&handshake->lock); return ret; }
47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 // SPDX-License-Identifier: GPL-2.0 /* Bareudp: UDP tunnel encasulation for different Payload types like * MPLS, NSH, IP, etc. * Copyright (c) 2019 Nokia, Inc. * Authors: Martin Varghese, <martin.varghese@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/hash.h> #include <net/dst_metadata.h> #include <net/gro_cells.h> #include <net/rtnetlink.h> #include <net/protocol.h> #include <net/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/udp_tunnel.h> #include <net/bareudp.h> #define BAREUDP_BASE_HLEN sizeof(struct udphdr) #define BAREUDP_IPV4_HLEN (sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define BAREUDP_IPV6_HLEN (sizeof(struct ipv6hdr) + \ sizeof(struct udphdr)) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); /* per-network namespace private data for this module */ static unsigned int bareudp_net_id; struct bareudp_net { struct list_head bareudp_list; }; /* Pseudo network device */ struct bareudp_dev { struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for bareudp tunnel */ __be16 ethertype; __be16 port; u16 sport_min; bool multi_proto_mode; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; }; static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct metadata_dst *tun_dst = NULL; struct bareudp_dev *bareudp; unsigned short family; unsigned int len; __be16 proto; void *oiph; int err; int nh; bareudp = rcu_dereference_sk_user_data(sk); if (!bareudp) goto drop; if (skb->protocol == htons(ETH_P_IP)) family = AF_INET; else family = AF_INET6; if (bareudp->ethertype == htons(ETH_P_IP)) { __u8 ipversion; if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, sizeof(ipversion))) { DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } ipversion >>= 4; if (ipversion == 4) { proto = htons(ETH_P_IP); } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { struct iphdr *tunnel_hdr; tunnel_hdr = (struct iphdr *)skb_network_header(skb); if (tunnel_hdr->version == 4) { if (!ipv4_is_multicast(tunnel_hdr->daddr)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && ipv4_is_multicast(tunnel_hdr->daddr)) { proto = htons(ETH_P_MPLS_MC); } else { DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } } else { int addr_type; struct ipv6hdr *tunnel_hdr_v6; tunnel_hdr_v6 = (struct ipv6hdr *)skb_network_header(skb); addr_type = ipv6_addr_type((struct in6_addr *)&tunnel_hdr_v6->daddr); if (!(addr_type & IPV6_ADDR_MULTICAST)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && (addr_type & IPV6_ADDR_MULTICAST)) { proto = htons(ETH_P_MPLS_MC); } else { DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } } } else { proto = bareudp->ethertype; } if (iptunnel_pull_header(skb, BAREUDP_BASE_HLEN, proto, !net_eq(bareudp->net, dev_net(bareudp->dev)))) { DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); if (!tun_dst) { DEV_STATS_INC(bareudp->dev, rx_dropped); goto drop; } skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; skb_reset_mac_header(skb); /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(bareudp->dev, rx_length_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (!ipv6_mod_enabled() || family == AF_INET) err = IP_ECN_decapsulate(oiph, skb); else err = IP6_ECN_decapsulate(oiph, skb); if (unlikely(err)) { if (log_ecn_error) { if (!ipv6_mod_enabled() || family == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } if (err > 1) { DEV_STATS_INC(bareudp->dev, rx_frame_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } } len = skb->len; err = gro_cells_receive(&bareudp->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_sw_netstats_rx_add(bareudp->dev, len); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } static int bareudp_err_lookup(struct sock *sk, struct sk_buff *skb) { return 0; } static int bareudp_init(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = gro_cells_init(&bareudp->gro_cells, dev); if (err) { free_percpu(dev->tstats); return err; } return 0; } static void bareudp_uninit(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); gro_cells_destroy(&bareudp->gro_cells); free_percpu(dev->tstats); } static struct socket *bareudp_create_sock(struct net *net, __be16 port) { struct udp_port_cfg udp_conf; struct socket *sock; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6_mod_enabled()) udp_conf.family = AF_INET6; else udp_conf.family = AF_INET; udp_conf.local_udp_port = port; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static int bareudp_socket_create(struct bareudp_dev *bareudp, __be16 port) { struct udp_tunnel_sock_cfg tunnel_cfg; struct socket *sock; sock = bareudp_create_sock(bareudp->net, port); if (IS_ERR(sock)) return PTR_ERR(sock); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = bareudp; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = bareudp_udp_encap_recv; tunnel_cfg.encap_err_lookup = bareudp_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg); rcu_assign_pointer(bareudp->sock, sock); return 0; } static int bareudp_open(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int ret = 0; ret = bareudp_socket_create(bareudp, bareudp->port); return ret; } static void bareudp_sock_release(struct bareudp_dev *bareudp) { struct socket *sock; sock = bareudp->sock; rcu_assign_pointer(bareudp->sock, NULL); synchronize_net(); udp_tunnel_sock_release(sock); } static int bareudp_stop(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); bareudp_sock_release(bareudp); return 0; } static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; __be16 sport, df; int min_headroom; __u8 tos, ttl; __be32 saddr; int err; if (!skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; rt = ip_route_output_tunnel(skb, dev, bareudp->net, &saddr, info, IPPROTO_UDP, use_cache); if (IS_ERR(rt)) return PTR_ERR(rt); skb_tunnel_check_pmtu(skb, &rt->dst, BAREUDP_IPV4_HLEN + info->options_len, false); sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct iphdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; skb_set_inner_protocol(skb, bareudp->ethertype); udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, bareudp->port, !net_eq(bareudp->net, dev_net(bareudp->dev)), !(info->key.tun_flags & TUNNEL_CSUM)); return 0; free_dst: dst_release(&rt->dst); return err; } static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); const struct ip_tunnel_key *key = &info->key; struct dst_entry *dst = NULL; struct in6_addr saddr, daddr; int min_headroom; __u8 prio, ttl; __be16 sport; int err; if (!skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; dst = ip6_dst_lookup_tunnel(skb, dev, bareudp->net, sock, &saddr, info, IPPROTO_UDP, use_cache); if (IS_ERR(dst)) return PTR_ERR(dst); skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len, false); sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct ipv6hdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; daddr = info->key.u.ipv6.dst; udp_tunnel6_xmit_skb(dst, sock->sk, skb, dev, &saddr, &daddr, prio, ttl, info->key.label, sport, bareudp->port, !(info->key.tun_flags & TUNNEL_CSUM)); return 0; free_dst: dst_release(dst); return err; } static bool bareudp_proto_valid(struct bareudp_dev *bareudp, __be16 proto) { if (bareudp->ethertype == proto) return true; if (!bareudp->multi_proto_mode) return false; if (bareudp->ethertype == htons(ETH_P_MPLS_UC) && proto == htons(ETH_P_MPLS_MC)) return true; if (bareudp->ethertype == htons(ETH_P_IP) && proto == htons(ETH_P_IPV6)) return true; return false; } static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); struct ip_tunnel_info *info = NULL; int err; if (!bareudp_proto_valid(bareudp, skb->protocol)) { err = -EINVAL; goto tx_error; } info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { err = -EINVAL; goto tx_error; } rcu_read_lock(); if (ipv6_mod_enabled() && info->mode & IP_TUNNEL_INFO_IPV6) err = bareudp6_xmit_skb(skb, dev, bareudp, info); else err = bareudp_xmit_skb(skb, dev, bareudp, info); rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; tx_error: dev_kfree_skb(skb); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static int bareudp_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); struct bareudp_dev *bareudp = netdev_priv(dev); bool use_cache; use_cache = ip_tunnel_dst_cache_usable(skb, info); if (!ipv6_mod_enabled() || ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; __be32 saddr; rt = ip_route_output_tunnel(skb, dev, bareudp->net, &saddr, info, IPPROTO_UDP, use_cache); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = saddr; } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct in6_addr saddr; struct socket *sock = rcu_dereference(bareudp->sock); dst = ip6_dst_lookup_tunnel(skb, dev, bareudp->net, sock, &saddr, info, IPPROTO_UDP, use_cache); if (IS_ERR(dst)) return PTR_ERR(dst); dst_release(dst); info->key.u.ipv6.src = saddr; } else { return -EINVAL; } info->key.tp_src = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); info->key.tp_dst = bareudp->port; return 0; } static const struct net_device_ops bareudp_netdev_ops = { .ndo_init = bareudp_init, .ndo_uninit = bareudp_uninit, .ndo_open = bareudp_open, .ndo_stop = bareudp_stop, .ndo_start_xmit = bareudp_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_fill_metadata_dst = bareudp_fill_metadata_dst, }; static const struct nla_policy bareudp_policy[IFLA_BAREUDP_MAX + 1] = { [IFLA_BAREUDP_PORT] = { .type = NLA_U16 }, [IFLA_BAREUDP_ETHERTYPE] = { .type = NLA_U16 }, [IFLA_BAREUDP_SRCPORT_MIN] = { .type = NLA_U16 }, [IFLA_BAREUDP_MULTIPROTO_MODE] = { .type = NLA_FLAG }, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type bareudp_type = { .name = "bareudp", }; /* Initialize the device structure. */ static void bareudp_setup(struct net_device *dev) { dev->netdev_ops = &bareudp_netdev_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &bareudp_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = ETH_DATA_LEN; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - BAREUDP_BASE_HLEN; dev->type = ARPHRD_NONE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; } static int bareudp_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) { NL_SET_ERR_MSG(extack, "Not enough attributes provided to perform the operation"); return -EINVAL; } return 0; } static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, struct netlink_ext_ack *extack) { memset(conf, 0, sizeof(*conf)); if (!data[IFLA_BAREUDP_PORT]) { NL_SET_ERR_MSG(extack, "port not specified"); return -EINVAL; } if (!data[IFLA_BAREUDP_ETHERTYPE]) { NL_SET_ERR_MSG(extack, "ethertype not specified"); return -EINVAL; } if (data[IFLA_BAREUDP_PORT]) conf->port = nla_get_u16(data[IFLA_BAREUDP_PORT]); if (data[IFLA_BAREUDP_ETHERTYPE]) conf->ethertype = nla_get_u16(data[IFLA_BAREUDP_ETHERTYPE]); if (data[IFLA_BAREUDP_SRCPORT_MIN]) conf->sport_min = nla_get_u16(data[IFLA_BAREUDP_SRCPORT_MIN]); if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; return 0; } static struct bareudp_dev *bareudp_find_dev(struct bareudp_net *bn, const struct bareudp_conf *conf) { struct bareudp_dev *bareudp, *t = NULL; list_for_each_entry(bareudp, &bn->bareudp_list, next) { if (conf->port == bareudp->port) t = bareudp; } return t; } static int bareudp_configure(struct net *net, struct net_device *dev, struct bareudp_conf *conf) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *t, *bareudp = netdev_priv(dev); int err; bareudp->net = net; bareudp->dev = dev; t = bareudp_find_dev(bn, conf); if (t) return -EBUSY; if (conf->multi_proto_mode && (conf->ethertype != htons(ETH_P_MPLS_UC) && conf->ethertype != htons(ETH_P_IP))) return -EINVAL; bareudp->port = conf->port; bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; err = register_netdevice(dev); if (err) return err; list_add(&bareudp->next, &bn->bareudp_list); return 0; } static int bareudp_link_config(struct net_device *dev, struct nlattr *tb[]) { int err; if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err) return err; } return 0; } static void bareudp_dellink(struct net_device *dev, struct list_head *head) { struct bareudp_dev *bareudp = netdev_priv(dev); list_del(&bareudp->next); unregister_netdevice_queue(dev, head); } static int bareudp_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bareudp_conf conf; int err; err = bareudp2info(data, &conf, extack); if (err) return err; err = bareudp_configure(net, dev, &conf); if (err) return err; err = bareudp_link_config(dev, tb); if (err) goto err_unconfig; return 0; err_unconfig: bareudp_dellink(dev, NULL); return err; } static size_t bareudp_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_PORT */ nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ 0; } static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); if (nla_put_be16(skb, IFLA_BAREUDP_PORT, bareudp->port)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_BAREUDP_ETHERTYPE, bareudp->ethertype)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BAREUDP_SRCPORT_MIN, bareudp->sport_min)) goto nla_put_failure; if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops bareudp_link_ops __read_mostly = { .kind = "bareudp", .maxtype = IFLA_BAREUDP_MAX, .policy = bareudp_policy, .priv_size = sizeof(struct bareudp_dev), .setup = bareudp_setup, .validate = bareudp_validate, .newlink = bareudp_newlink, .dellink = bareudp_dellink, .get_size = bareudp_get_size, .fill_info = bareudp_fill_info, }; struct net_device *bareudp_dev_create(struct net *net, const char *name, u8 name_assign_type, struct bareudp_conf *conf) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; int err; memset(tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &bareudp_link_ops, tb, NULL); if (IS_ERR(dev)) return dev; err = bareudp_configure(net, dev, conf); if (err) { free_netdev(dev); return ERR_PTR(err); } err = dev_set_mtu(dev, IP_MAX_MTU - BAREUDP_BASE_HLEN); if (err) goto err; err = rtnl_configure_link(dev, NULL); if (err < 0) goto err; return dev; err: bareudp_dellink(dev, NULL); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(bareudp_dev_create); static __net_init int bareudp_init_net(struct net *net) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); INIT_LIST_HEAD(&bn->bareudp_list); return 0; } static void bareudp_destroy_tunnels(struct net *net, struct list_head *head) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *bareudp, *next; list_for_each_entry_safe(bareudp, next, &bn->bareudp_list, next) unregister_netdevice_queue(bareudp->dev, head); } static void __net_exit bareudp_exit_batch_net(struct list_head *net_list) { struct net *net; LIST_HEAD(list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) bareudp_destroy_tunnels(net, &list); /* unregister the devices gathered above */ unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations bareudp_net_ops = { .init = bareudp_init_net, .exit_batch = bareudp_exit_batch_net, .id = &bareudp_net_id, .size = sizeof(struct bareudp_net), }; static int __init bareudp_init_module(void) { int rc; rc = register_pernet_subsys(&bareudp_net_ops); if (rc) goto out1; rc = rtnl_link_register(&bareudp_link_ops); if (rc) goto out2; return 0; out2: unregister_pernet_subsys(&bareudp_net_ops); out1: return rc; } late_initcall(bareudp_init_module); static void __exit bareudp_cleanup_module(void) { rtnl_link_unregister(&bareudp_link_ops); unregister_pernet_subsys(&bareudp_net_ops); } module_exit(bareudp_cleanup_module); MODULE_ALIAS_RTNL_LINK("bareudp"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Varghese <martin.varghese@nokia.com>"); MODULE_DESCRIPTION("Interface driver for UDP encapsulated traffic");
11 85 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct skb_array' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * Limited-size FIFO of skbs. Can be used more or less whenever * sk_buff_head can be used, except you need to know the queue size in * advance. * Implemented as a type-safe wrapper around ptr_ring. */ #ifndef _LINUX_SKB_ARRAY_H #define _LINUX_SKB_ARRAY_H 1 #ifdef __KERNEL__ #include <linux/ptr_ring.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #endif struct skb_array { struct ptr_ring ring; }; /* Might be slightly faster than skb_array_full below, but callers invoking * this in a loop must use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_full(struct skb_array *a) { return __ptr_ring_full(&a->ring); } static inline bool skb_array_full(struct skb_array *a) { return ptr_ring_full(&a->ring); } static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce(&a->ring, skb); } static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_irq(&a->ring, skb); } static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_bh(&a->ring, skb); } static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_any(&a->ring, skb); } /* Might be slightly faster than skb_array_empty below, but only safe if the * array is never resized. Also, callers invoking this in a loop must take care * to use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_empty(struct skb_array *a) { return __ptr_ring_empty(&a->ring); } static inline struct sk_buff *__skb_array_peek(struct skb_array *a) { return __ptr_ring_peek(&a->ring); } static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); } static inline bool skb_array_empty_bh(struct skb_array *a) { return ptr_ring_empty_bh(&a->ring); } static inline bool skb_array_empty_irq(struct skb_array *a) { return ptr_ring_empty_irq(&a->ring); } static inline bool skb_array_empty_any(struct skb_array *a) { return ptr_ring_empty_any(&a->ring); } static inline struct sk_buff *__skb_array_consume(struct skb_array *a) { return __ptr_ring_consume(&a->ring); } static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); } static inline int skb_array_consume_batched(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } static inline int skb_array_consume_batched_irq(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } static inline int skb_array_consume_batched_any(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } static inline int skb_array_consume_batched_bh(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); } static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { int len = skb->len; if (skb_vlan_tag_present(skb)) len += VLAN_HLEN; return len; } else { return 0; } } static inline int skb_array_peek_len(struct skb_array *a) { return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_irq(struct skb_array *a) { return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_bh(struct skb_array *a) { return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_any(struct skb_array *a) { return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_init(&a->ring, size, gfp); } static void __skb_array_destroy_skb(void *ptr) { kfree_skb(ptr); } static inline void skb_array_unconsume(struct skb_array *a, struct sk_buff **skbs, int n) { ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); } static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } static inline int skb_array_resize_multiple(struct skb_array **rings, int nrings, unsigned int size, gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp, __skb_array_destroy_skb); } static inline void skb_array_cleanup(struct skb_array *a) { ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); } #endif /* _LINUX_SKB_ARRAY_H */
90 92 1 1 1 1 574 571 575 577 387 82 340 389 387 412 248 7 7 7 191 293 43 43 43 43 43 43 43 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_bridge.h> #include <net/netfilter/nf_log.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/sysctl.h> #include <net/route.h> #include <net/ip.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <linux/ipv6.h> #include <linux/in6.h> #include <net/ipv6.h> #include <net/inet_frag.h> static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL __printf(4, 5) void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...) { struct net *net = state->net; struct va_format vaf; va_list args; if (net->ct.sysctl_log_invalid != protonum && net->ct.sysctl_log_invalid != IPPROTO_RAW) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; nf_log_packet(net, state->pf, 0, skb, state->in, state->out, NULL, "nf_ct_proto_%d: %pV ", protonum, &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); __printf(4, 5) void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...) { struct va_format vaf; struct net *net; va_list args; net = nf_ct_net(ct); if (likely(net->ct.sysctl_log_invalid == 0)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; nf_l4proto_log_invalid(skb, state, nf_ct_protonum(ct), "%pV", &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); #endif const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) { switch (l4proto) { case IPPROTO_UDP: return &nf_conntrack_l4proto_udp; case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp; case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp; #ifdef CONFIG_NF_CT_PROTO_DCCP case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp; #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp; #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE case IPPROTO_UDPLITE: return &nf_conntrack_l4proto_udplite; #endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return &nf_conntrack_l4proto_gre; #endif #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return &nf_conntrack_l4proto_icmpv6; #endif /* CONFIG_IPV6 */ } return &nf_conntrack_l4proto_generic; }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; help = nfct_help(ct); if (help) { const struct nf_conntrack_helper *helper; int ret; /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (helper) { ret = helper->help(skb, protoff, ct, ctinfo); if (ret != NF_ACCEPT) return ret; } } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } } /* We've seen it coming out the other side: confirm it */ return nf_conntrack_confirm(skb); } EXPORT_SYMBOL_GPL(nf_confirm); static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) return nf_conntrack_confirm(skb); return nf_confirm(skb, skb_network_offset(skb) + ip_hdrlen(skb), ct, ctinfo); } static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return nf_conntrack_in(skb, state); } static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ enum ip_conntrack_info ctinfo; struct nf_conn *tmpl; tmpl = nf_ct_get(skb, &ctinfo); if (tmpl && nf_ct_is_template(tmpl)) { /* when skipping ct, clear templates to avoid fooling * later targets/matches */ skb->_nfct = 0; nf_ct_put(tmpl); } return NF_ACCEPT; } return nf_conntrack_in(skb, state); } /* Connection tracking may drop packets, but never alters them, so * make it the first hook. */ static const struct nf_hook_ops ipv4_conntrack_ops[] = { { .hook = ipv4_conntrack_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_local, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, }; /* Fast function for those who don't want to parse /proc (and I don't * blame them). * Reversing the socket's dst/src point of view gives us the reply * mapping. */ static int getorigdst(struct sock *sk, int optval, void __user *user, int *len) { const struct inet_sock *inet = inet_sk(sk); const struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; memset(&tuple, 0, sizeof(tuple)); lock_sock(sk); tuple.src.u3.ip = inet->inet_rcv_saddr; tuple.src.u.tcp.port = inet->inet_sport; tuple.dst.u3.ip = inet->inet_daddr; tuple.dst.u.tcp.port = inet->inet_dport; tuple.src.l3num = PF_INET; tuple.dst.protonum = sk->sk_protocol; release_sock(sk); /* We only do TCP and SCTP at the moment: is there a better way? */ if (tuple.dst.protonum != IPPROTO_TCP && tuple.dst.protonum != IPPROTO_SCTP) { pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); return -ENOPROTOOPT; } if ((unsigned int)*len < sizeof(struct sockaddr_in)) { pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", *len, sizeof(struct sockaddr_in)); return -EINVAL; } h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); sin.sin_family = AF_INET; sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] .tuple.dst.u.tcp.port; sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] .tuple.dst.u3.ip; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", &sin.sin_addr.s_addr, ntohs(sin.sin_port)); nf_ct_put(ct); if (copy_to_user(user, &sin, sizeof(sin)) != 0) return -EFAULT; else return 0; } pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, .get_optmax = SO_ORIGINAL_DST + 1, .get = getorigdst, .owner = THIS_MODULE, }; #if IS_ENABLED(CONFIG_IPV6) static int ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) { struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; const struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct inet_sock *inet = inet_sk(sk); const struct nf_conntrack_tuple_hash *h; struct sockaddr_in6 sin6; struct nf_conn *ct; __be32 flow_label; int bound_dev_if; lock_sock(sk); tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; tuple.src.u.tcp.port = inet->inet_sport; tuple.dst.u3.in6 = sk->sk_v6_daddr; tuple.dst.u.tcp.port = inet->inet_dport; tuple.dst.protonum = sk->sk_protocol; bound_dev_if = sk->sk_bound_dev_if; flow_label = inet6->flow_label; release_sock(sk); if (tuple.dst.protonum != IPPROTO_TCP && tuple.dst.protonum != IPPROTO_SCTP) return -ENOPROTOOPT; if (*len < 0 || (unsigned int)*len < sizeof(sin6)) return -EINVAL; h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (!h) { pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } ct = nf_ct_tuplehash_to_ctrack(h); sin6.sin6_family = AF_INET6; sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; memcpy(&sin6.sin6_addr, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, sizeof(sin6.sin6_addr)); nf_ct_put(ct); sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; } static struct nf_sockopt_ops so_getorigdst6 = { .pf = NFPROTO_IPV6, .get_optmin = IP6T_SO_ORIGINAL_DST, .get_optmax = IP6T_SO_ORIGINAL_DST + 1, .get = ipv6_getorigdst, .owner = THIS_MODULE, }; static unsigned int ipv6_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned char pnum = ipv6_hdr(skb)->nexthdr; __be16 frag_off; int protoff; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) return nf_conntrack_confirm(skb); protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return nf_conntrack_confirm(skb); } return nf_confirm(skb, protoff, ct, ctinfo); } static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return nf_conntrack_in(skb, state); } static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return nf_conntrack_in(skb, state); } static const struct nf_hook_ops ipv6_conntrack_ops[] = { { .hook = ipv6_conntrack_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK, }, { .hook = ipv6_conntrack_local, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK, }, { .hook = ipv6_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { .hook = ipv6_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST - 1, }, }; #endif static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto) { u8 nfproto = (unsigned long)_nfproto; if (nf_ct_l3num(ct) != nfproto) return 0; if (nf_ct_protonum(ct) == IPPROTO_TCP && ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED) { ct->proto.tcp.seen[0].td_maxwin = 0; ct->proto.tcp.seen[1].td_maxwin = 0; } return 0; } static struct nf_ct_bridge_info *nf_ct_bridge_info; static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); bool fixup_needed = false, retry = true; int err = 0; retry: mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { case NFPROTO_IPV4: cnet->users4++; if (cnet->users4 > 1) goto out_unlock; err = nf_defrag_ipv4_enable(net); if (err) { cnet->users4 = 0; goto out_unlock; } err = nf_register_net_hooks(net, ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); if (err) cnet->users4 = 0; else fixup_needed = true; break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: cnet->users6++; if (cnet->users6 > 1) goto out_unlock; err = nf_defrag_ipv6_enable(net); if (err < 0) { cnet->users6 = 0; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); if (err) cnet->users6 = 0; else fixup_needed = true; break; #endif case NFPROTO_BRIDGE: if (!nf_ct_bridge_info) { if (!retry) { err = -EPROTO; goto out_unlock; } mutex_unlock(&nf_ct_proto_mutex); request_module("nf_conntrack_bridge"); retry = false; goto retry; } if (!try_module_get(nf_ct_bridge_info->me)) { err = -EPROTO; goto out_unlock; } cnet->users_bridge++; if (cnet->users_bridge > 1) goto out_unlock; err = nf_register_net_hooks(net, nf_ct_bridge_info->ops, nf_ct_bridge_info->ops_size); if (err) cnet->users_bridge = 0; else fixup_needed = true; break; default: err = -EPROTO; break; } out_unlock: mutex_unlock(&nf_ct_proto_mutex); if (fixup_needed) nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup, (void *)(unsigned long)nfproto, 0, 0); return err; } static void nf_ct_netns_do_put(struct net *net, u8 nfproto) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { case NFPROTO_IPV4: if (cnet->users4 && (--cnet->users4 == 0)) { nf_unregister_net_hooks(net, ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); nf_defrag_ipv4_disable(net); } break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: if (cnet->users6 && (--cnet->users6 == 0)) { nf_unregister_net_hooks(net, ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); nf_defrag_ipv6_disable(net); } break; #endif case NFPROTO_BRIDGE: if (!nf_ct_bridge_info) break; if (cnet->users_bridge && (--cnet->users_bridge == 0)) nf_unregister_net_hooks(net, nf_ct_bridge_info->ops, nf_ct_bridge_info->ops_size); module_put(nf_ct_bridge_info->me); break; } mutex_unlock(&nf_ct_proto_mutex); } static int nf_ct_netns_inet_get(struct net *net) { int err; err = nf_ct_netns_do_get(net, NFPROTO_IPV4); #if IS_ENABLED(CONFIG_IPV6) if (err < 0) goto err1; err = nf_ct_netns_do_get(net, NFPROTO_IPV6); if (err < 0) goto err2; return err; err2: nf_ct_netns_put(net, NFPROTO_IPV4); err1: #endif return err; } int nf_ct_netns_get(struct net *net, u8 nfproto) { int err; switch (nfproto) { case NFPROTO_INET: err = nf_ct_netns_inet_get(net); break; case NFPROTO_BRIDGE: err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE); if (err < 0) return err; err = nf_ct_netns_inet_get(net); if (err < 0) { nf_ct_netns_put(net, NFPROTO_BRIDGE); return err; } break; default: err = nf_ct_netns_do_get(net, nfproto); break; } return err; } EXPORT_SYMBOL_GPL(nf_ct_netns_get); void nf_ct_netns_put(struct net *net, uint8_t nfproto) { switch (nfproto) { case NFPROTO_BRIDGE: nf_ct_netns_do_put(net, NFPROTO_BRIDGE); fallthrough; case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); break; default: nf_ct_netns_do_put(net, nfproto); break; } } EXPORT_SYMBOL_GPL(nf_ct_netns_put); void nf_ct_bridge_register(struct nf_ct_bridge_info *info) { WARN_ON(nf_ct_bridge_info); mutex_lock(&nf_ct_proto_mutex); nf_ct_bridge_info = info; mutex_unlock(&nf_ct_proto_mutex); } EXPORT_SYMBOL_GPL(nf_ct_bridge_register); void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info) { WARN_ON(!nf_ct_bridge_info); mutex_lock(&nf_ct_proto_mutex); nf_ct_bridge_info = NULL; mutex_unlock(&nf_ct_proto_mutex); } EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister); int nf_conntrack_proto_init(void) { int ret; ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) return ret; #if IS_ENABLED(CONFIG_IPV6) ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) goto cleanup_sockopt; #endif return ret; #if IS_ENABLED(CONFIG_IPV6) cleanup_sockopt: nf_unregister_sockopt(&so_getorigdst); #endif return ret; } void nf_conntrack_proto_fini(void) { nf_unregister_sockopt(&so_getorigdst); #if IS_ENABLED(CONFIG_IPV6) nf_unregister_sockopt(&so_getorigdst6); #endif } void nf_conntrack_proto_pernet_init(struct net *net) { nf_conntrack_generic_init_net(net); nf_conntrack_udp_init_net(net); nf_conntrack_tcp_init_net(net); nf_conntrack_icmp_init_net(net); #if IS_ENABLED(CONFIG_IPV6) nf_conntrack_icmpv6_init_net(net); #endif #ifdef CONFIG_NF_CT_PROTO_DCCP nf_conntrack_dccp_init_net(net); #endif #ifdef CONFIG_NF_CT_PROTO_SCTP nf_conntrack_sctp_init_net(net); #endif #ifdef CONFIG_NF_CT_PROTO_GRE nf_conntrack_gre_init_net(net); #endif } module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); MODULE_ALIAS("ip_conntrack"); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL");
168 186 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H #include <asm/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) struct module; #define NFT_JUMP_STACK_SIZE 16 enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), }; struct nft_pktinfo { struct sk_buff *skb; const struct nf_hook_state *state; u8 flags; u8 tprot; u16 fragoff; unsigned int thoff; unsigned int inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) { return pkt->state->sk; } static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) { return pkt->thoff; } static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { pkt->skb = skb; pkt->state = state; } static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->flags = 0; pkt->tprot = 0; pkt->thoff = 0; pkt->fragoff = 0; } /** * struct nft_verdict - nf_tables verdict * * @code: nf_tables/netfilter verdict code * @chain: destination chain for NFT_JUMP/NFT_GOTO */ struct nft_verdict { u32 code; struct nft_chain *chain; }; struct nft_data { union { u32 data[4]; struct nft_verdict verdict; }; } __attribute__((aligned(__alignof__(u64)))); /** * struct nft_regs - nf_tables register set * * @data: data registers * @verdict: verdict register * * The first four data registers alias to the verdict register. */ struct nft_regs { union { u32 data[20]; struct nft_verdict verdict; }; }; /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit * level. So for store instruction, pad the rest part with zero to avoid * garbage values. */ static inline void nft_reg_store8(u32 *dreg, u8 val) { *dreg = 0; *(u8 *)dreg = val; } static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } static inline void nft_reg_store16(u32 *dreg, u16 val) { *dreg = 0; *(u16 *)dreg = val; } static inline void nft_reg_store_be16(u32 *dreg, __be16 val) { nft_reg_store16(dreg, (__force __u16)val); } static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } static inline __be16 nft_reg_load_be16(const u32 *sreg) { return (__force __be16)nft_reg_load16(sreg); } static inline __be32 nft_reg_load_be32(const u32 *sreg) { return *(__force __be32 *)sreg; } static inline void nft_reg_store64(u64 *dreg, u64 val) { put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { if (len % NFT_REG32_SIZE) dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } /** * struct nft_ctx - nf_tables rule/set context * * @net: net namespace * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; u16 flags; u8 family; u8 level; bool report; }; enum nft_data_desc_flags { NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) { return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object * * @len: length of the data * @data: content * * The presence of user data is indicated in an object specific fashion, * so a length of zero can't occur and the value "len" indicates data * of length len + 1. */ struct nft_userdata { u8 len; unsigned char data[]; }; /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key * @priv: element private data and extensions */ struct nft_set_elem { union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; void *priv; }; /** * enum nft_iter_type - nftables set iterator type * * @NFT_ITER_READ: read-only iteration over set elements * @NFT_ITER_UPDATE: iteration under mutex to update set element state */ enum nft_iter_type { NFT_ITER_UNSPEC, NFT_ITER_READ, NFT_ITER_UPDATE, }; struct nft_set; struct nft_set_iter { u8 genmask; enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); }; /** * struct nft_set_desc - description of set elements * * @ktype: key type * @klen: key length * @dtype: data type * @dlen: data length * @objtype: object type * @flags: flags * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { u32 ktype; unsigned int klen; u32 dtype; unsigned int dlen; u32 objtype; unsigned int size; u32 policy; u32 gc_int; u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; }; /** * enum nft_set_class - performance class * * @NFT_LOOKUP_O_1: constant, O(1) * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) * @NFT_LOOKUP_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, NFT_SET_CLASS_O_LOG_N, NFT_SET_CLASS_O_N, }; /** * struct nft_set_estimate - estimation of memory and performance * characteristics * * @size: required memory * @lookup: lookup performance class * @space: memory class */ struct nft_set_estimate { u64 size; enum nft_set_class lookup; enum nft_set_class space; }; #define NFT_EXPR_MAXATTR 16 #define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ ALIGN(size, __alignof__(struct nft_expr))) /** * struct nft_expr - nf_tables expression * * @ops: expression ops * @data: expression private data */ struct nft_expr { const struct nft_expr_ops *ops; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) { return (void *)expr->data; } int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr); struct nft_set_ext; /** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set * @update: update an element if exists, add it if doesn't exist * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements * @privsize: function to return size of set private data * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster * and currently only used in the packet path. All the rest are slower, * control plane functions. */ struct nft_set_ops { bool (*lookup)(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, void *(*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext); void (*activate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); bool (*flush)(const struct net *net, const struct nft_set *set, void *priv); void (*remove)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); void * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); void (*commit)(const struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_ctx *ctx, const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; /** * struct nft_set_type - nf_tables set type * * @ops: set ops for this type * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; u32 features; }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) struct nft_set_elem_expr { u8 size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; #define nft_setelem_expr_at(__elem_expr, __offset) \ ((struct nft_expr *)&__elem_expr->data[__offset]) #define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ __size < (__elem_expr)->size; \ __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) #define NFT_SET_EXPR_MAX 2 /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal * @timeout: default timeout value in jiffies * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data * @expr: stateful expression * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; refcount_t refs; struct nft_table *table; possible_net_t net; char *name; u64 handle; u32 ktype; u32 dtype; u32 objtype; u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; u32 use; atomic_t nelems; u32 ndeact; u64 timeout; u32 gc_int; u16 policy; u16 udlen; unsigned char *udata; struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:13, dead:1, genmask:2; u8 klen; u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline bool nft_set_is_anonymous(const struct nft_set *set) { return set->flags & NFT_SET_ANONYMOUS; } static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; } static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) { return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; } static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask); struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { u32 gc_int = READ_ONCE(set->gc_int); return gc_int ? msecs_to_jiffies(gc_int) : HZ; } /** * struct nft_set_binding - nf_tables set binding * * @list: set bindings list node * @chain: chain containing the rule bound to the set * @flags: set action flags * * A set binding contains all information necessary for validation * of new elements added to a bound set. */ struct nft_set_binding { struct list_head list; const struct nft_chain *chain; u32 flags; }; enum nft_trans_phase; void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPRESSIONS: expressions assiciated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; /** * struct nft_set_ext_type - set extension type * * @len: fixed part length of the extension * @align: alignment requirements of the extension */ struct nft_set_ext_type { u8 len; u8 align; }; extern const struct nft_set_ext_type nft_set_ext_types[]; /** * struct nft_set_ext_tmpl - set extension template * * @len: length of extension area * @offset: offsets of individual extension types */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data * * This structure must be aligned to word size, otherwise atomic bitops * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; } __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { memset(tmpl, 0, sizeof(*tmpl)); tmpl->len = sizeof(struct nft_set_ext); } static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); if (tmpl->len > U8_MAX) return -EINVAL; tmpl->offset[id] = tmpl->len; tmpl->len += nft_set_ext_types[id].len + len; return 0; } static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, const struct nft_set_ext_tmpl *tmpl) { memcpy(ext->offset, tmpl->offset, sizeof(ext->offset)); } static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return ext && __nft_set_ext_exists(ext, id); } static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id) { return (void *)ext + ext->offset[id]; } static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY); } static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY_END); } static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); } static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, u64 tstamp) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && time_after_eq64(tstamp, *nft_set_ext_expiration(ext)); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, void *elem) { return elem + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present * @list: used internally * @name: Identifier * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number * @family: address family for AF-specific types * @flags: expression type flags */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; struct list_head list; const char *name; struct module *owner; const struct nla_policy *policy; unsigned int maxattr; u8 family; u8 flags; }; #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 enum nft_trans_phase { NFT_TRANS_PREPARE, NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE }; struct nft_flow_rule; struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu * @dump: function to dump parameters * @type: expression type * @validate: validate expression, called during loop detection * @data: extra data to attach to this expression operation */ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; /** * struct nft_rule - nf_tables rule * * @list: used internally * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { struct list_head list; u64 handle:42, genmask:2, dlen:12, udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; } static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr) { return ((void *)expr) + expr->ops->size; } static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[rule->dlen]; } static inline bool nft_expr_more(const struct nft_rule *rule, const struct nft_expr *expr) { return expr != nft_expr_last(rule) && expr->ops; } static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase); void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; u32 size; if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) { expr->ops->eval(expr, regs, pkt); if (regs->verdict.code == NFT_BREAK) return; } } } /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it * can't assume that the dlen value wasn't changed within calls in the loop. */ #define nft_rule_for_each_expr(expr, last, rule) \ for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \ (expr) != (last); \ (expr) = nft_expr_next(expr)) #define NFT_CHAIN_POLICY_UNSET U8_MAX /** * struct nft_chain - nf_tables chain * * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain * @flags: bitmask of enum nft_chain_flags * @name: name of the chain */ struct nft_chain { struct nft_rule *__rcu *rules_gen_0; struct nft_rule *__rcu *rules_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; u8 flags:5, bound:1, genmask:2; char *name; u16 udlen; u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule **rules_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, NFT_CHAIN_T_MAX }; /** * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions * @ops_register: base chain register function * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); static inline bool nft_chain_binding(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BINDING; } static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); struct nft_stats { u64 bytes; u64 pkts; struct u64_stats_sync syncp; }; struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; }; /** * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) { return container_of(chain, struct nft_base_chain, chain); } static inline bool nft_is_base_chain(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) { if (*use == UINT_MAX) return false; (*use)++; return true; } static inline void nft_use_dec(u32 *use) { WARN_ON_ONCE((*use)-- == 0); } /* For error and abort path: restore use counter to previous state. */ static inline void nft_use_inc_restore(u32 *use) { WARN_ON_ONCE(!nft_use_inc(use)); } #define nft_use_dec_restore nft_use_dec /** * struct nft_table - nf_tables table * * @list: used internally * @chains_ht: chains in the table * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask * @afinfo: address family info * @name: name of the table */ struct nft_table { struct list_head list; struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; struct list_head flowtables; u64 hgenerator; u64 handle; u32 use; u16 family:6, flags:8, genmask:2; u32 nlpid; char *name; u16 udlen; u8 *udata; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @key: keys that identify this object * @rhlhead: nft_objname_ht node * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * * @select_ops: function to select nft_object_ops * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); const struct nft_object_ops *ops; struct list_head list; u32 type; unsigned int maxattr; u8 family; struct module *owner; const struct nla_policy *policy; }; /** * struct nft_object_ops - stateful object operations * * @eval: stateful object evaluation function * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object */ struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); void (*update)(struct nft_object *obj, struct nft_object *newobj); const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); #define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table * * @list: flow table list node in table list * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle * @dev_name: array of device names * @data: rhashtable and garbage collector * @ops: array of hooks */ struct nft_flowtable { struct list_head list; struct nft_table *table; char *name; int hooknum; int ops_len; u32 genmask:2; u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * * @pkt: pktinfo currently processed * @basechain: base chain currently processed * @chain: chain currently processed * @rule: rule that was evaluated * @verdict: verdict given by rule * @type: event type (enum nft_trace_types) * @packet_dumped: packet headers sent in a previous traceinfo message * @trace: other struct members are initialised */ struct nft_traceinfo { const struct nft_pktinfo *pkt; const struct nft_base_chain *basechain; const struct nft_chain *chain; const struct nft_rule *rule; const struct nft_verdict *verdict; enum nft_trace_types type; bool packet_dumped; bool trace; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_chain *basechain); void nft_trace_notify(struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" __stringify(type)) #if IS_ENABLED(CONFIG_NF_TABLES) /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations * they're active in. A set bit means they're inactive in the generation * represented by that bit. * * New objects start out as inactive in the current and active in the * next generation. When committing the ruleset the bitmask is cleared, * meaning they're active in all generations. When removing an object, * it is set inactive in the next generation. After committing the ruleset, * the objects are removed. */ static inline unsigned int nft_gencursor_next(const struct net *net) { return net->nft.gencursor + 1 == 1 ? 1 : 0; } static inline u8 nft_genmask_next(const struct net *net) { return 1 << nft_gencursor_next(net); } static inline u8 nft_genmask_cur(const struct net *net) { /* Use READ_ONCE() to prevent refetching the value for atomicity */ return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) /* * Generic transaction helpers */ /* Check if this object is currently active. */ #define nft_is_active(__net, __obj) \ (((__obj)->genmask & nft_genmask_cur(__net)) == 0) /* Check if this object is active in the next generation. */ #define nft_is_active_next(__net, __obj) \ (((__obj)->genmask & nft_genmask_next(__net)) == 0) /* This object becomes active in the next generation. */ #define nft_activate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_cur(__net) /* This object becomes inactive in the next generation. */ #define nft_deactivate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_next(__net) /* After committing the ruleset, clear the stale generation bit. */ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context * @data: internal information related to the transaction */ struct nft_trans { struct list_head list; struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; char data[]; }; struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) #define nft_trans_flow_rule(trans) \ (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) #define nft_trans_rule_bound(trans) \ (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; u32 set_id; u32 gc_int; u64 timeout; bool update; bool bound; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) #define nft_trans_set_bound(trans) \ (((struct nft_trans_set *)trans->data)->bound) #define nft_trans_set_update(trans) \ (((struct nft_trans_set *)trans->data)->update) #define nft_trans_set_timeout(trans) \ (((struct nft_trans_set *)trans->data)->timeout) #define nft_trans_set_gc_int(trans) \ (((struct nft_trans_set *)trans->data)->gc_int) struct nft_trans_chain { struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; bool bound; u32 chain_id; }; #define nft_trans_chain(trans) \ (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ (((struct nft_trans_chain *)trans->data)->name) #define nft_trans_chain_stats(trans) \ (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) #define nft_trans_chain_bound(trans) \ (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) struct nft_trans_table { bool update; }; #define nft_trans_table_update(trans) \ (((struct nft_trans_table *)trans->data)->update) struct nft_trans_elem { struct nft_set *set; struct nft_set_elem elem; bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem(trans) \ (((struct nft_trans_elem *)trans->data)->elem) #define nft_trans_elem_set_bound(trans) \ (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; struct nft_object *newobj; bool update; }; #define nft_trans_obj(trans) \ (((struct nft_trans_obj *)trans->data)->obj) #define nft_trans_obj_newobj(trans) \ (((struct nft_trans_obj *)trans->data)->newobj) #define nft_trans_obj_update(trans) \ (((struct nft_trans_obj *)trans->data)->update) struct nft_trans_flowtable { struct nft_flowtable *flowtable; bool update; struct list_head hook_list; u32 flags; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) #define nft_trans_flowtable_update(trans) \ (((struct nft_trans_flowtable *)trans->data)->update) #define nft_trans_flowtable_hooks(trans) \ (((struct nft_trans_flowtable *)trans->data)->hook_list) #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) #define NFT_TRANS_GC_BATCHCOUNT 256 struct nft_trans_gc { struct list_head list; struct net *net; struct nft_set *set; u32 seq; u16 count; void *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq); struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_set_elem *elem); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } #endif struct nftables_pernet { struct list_head tables; struct list_head commit_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; u64 tstamp; unsigned int base_seq; u8 validate_state; unsigned int gc_seq; }; extern unsigned int nf_tables_net_id; static inline struct nftables_pernet *nft_pernet(const struct net *net) { return net_generic(net, nf_tables_net_id); } static inline u64 nft_net_tstamp(const struct net *net) { return nft_pernet(net)->tstamp; } #endif /* _NET_NF_TABLES_H */
93 93 93 93 93 93 93 89 88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 // SPDX-License-Identifier: GPL-2.0-only /* File: fs/xattr.c Extended attribute handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com> Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/evm.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> #include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) { while (*a_prefix && *a == *a_prefix) { a++; a_prefix++; } return *a_prefix ? NULL : a; } /* * In order to implement different sets of xattr operations for each xattr * prefix, a filesystem should create a null-terminated array of struct * xattr_handler (one for each prefix) and hang a pointer to it off of the * s_xattr field of the superblock. */ #define for_each_xattr_handler(handlers, handler) \ if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) /* * Find the xattr_handler with the matching prefix. */ static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { const struct xattr_handler **handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return ERR_PTR(-EIO); return ERR_PTR(-EOPNOTSUPP); } for_each_xattr_handler(handlers, handler) { const char *n; n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { if (!handler->prefix ^ !*n) { if (*n) continue; return ERR_PTR(-EINVAL); } *name = n; return handler; } } return ERR_PTR(-EOPNOTSUPP); } /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. */ static int xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, const char *name, int mask) { /* * We can never set or remove an extended attribute on a read-only * filesystem or on an immutable / append-only inode. */ if (mask & MAY_WRITE) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; /* * Updating an xattr will likely cause i_uid and i_gid * to be writen back improperly if their true value is * unknown to the vfs. */ if (HAS_UNMAPPED_ID(mnt_userns, inode)) return -EPERM; } /* * No restriction for security.* and system.* from the VFS. Decision * on these is left to the underlying filesystem / security module. */ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return 0; /* * The trusted.* namespace can only be accessed by privileged users. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(mnt_userns, inode)) return -EPERM; } return inode_permission(mnt_userns, inode, mask); } /* * Look for any handler that deals with the specified namespace. */ int xattr_supported_namespace(struct inode *inode, const char *prefix) { const struct xattr_handler **handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; size_t preflen; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return -EIO; return -EOPNOTSUPP; } preflen = strlen(prefix); for_each_xattr_handler(handlers, handler) { if (!strncmp(xattr_prefix(handler), prefix, preflen)) return 0; } return -EOPNOTSUPP; } EXPORT_SYMBOL(xattr_supported_namespace); int __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, mnt_userns, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @mnt_userns: user namespace of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_mutex before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(mnt_userns, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * * @mnt_userns: user namespace of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(mnt_userns, dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; const void *orig_value = value; int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(mnt_userns, dentry, &value, size); if (error < 0) return error; size = error; } retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } if (value != orig_value) kfree(value); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(mnt_userns, inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(mnt_userns, inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. * * Returns the result of alloc, if failed, or the getxattr operation. */ ssize_t vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(mnt_userns, inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(mnt_userns, inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(mnt_userns, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * * @mnt_userns: user namespace of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(mnt_userns, dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(mnt_userns, dentry, name); if (!error) { fsnotify_xattr(dentry); evm_inode_post_removexattr(dentry, name); } out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(mnt_userns, dentry, name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); /* * Extended attribute SET operations */ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = strncpy_from_user(ctx->kname->name, name, sizeof(ctx->kname->name)); if (error == 0 || error == sizeof(ctx->kname->name)) return -ERANGE; if (error < 0) return error; error = 0; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); if (IS_ERR(ctx->kvalue)) { error = PTR_ERR(ctx->kvalue); ctx->kvalue = NULL; } } return error; } static void setxattr_convert(struct user_namespace *mnt_userns, struct dentry *d, struct xattr_ctx *ctx) { if (ctx->size && ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), ctx->kvalue, ctx->size); } int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx) { setxattr_convert(mnt_userns, dentry, ctx); return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } static long setxattr(struct user_namespace *mnt_userns, struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { struct xattr_name kname; struct xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = flags, }; int error; error = setxattr_copy(name, &ctx); if (error) return error; error = do_setxattr(mnt_userns, d, &ctx); kvfree(ctx.kvalue); return error; } static int path_setxattr(const char __user *pathname, const char __user *name, const void __user *value, size_t size, int flags, unsigned int lookup_flags) { struct path path; int error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = setxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size, flags); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattr(pathname, name, value, size, flags, 0); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { struct fd f = fdget(fd); int error = -EBADF; if (!f.file) return error; audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { error = setxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); } fdput(f); return error; } /* * Extended attribute GET operations */ static ssize_t getxattr(struct user_namespace *mnt_userns, struct dentry *d, const char __user *name, void __user *value, size_t size) { ssize_t error; void *kvalue = NULL; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) error = -ERANGE; if (error < 0) return error; if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; kvalue = kvzalloc(size, GFP_KERNEL); if (!kvalue) return -ENOMEM; } error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), kvalue, error); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(kvalue); return error; } static ssize_t path_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattr(pathname, name, value, size, 0); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { struct fd f = fdget(fd); ssize_t error = -EBADF; if (!f.file) return error; audit_file(f.file); error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, name, value, size); fdput(f); return error; } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(klist); return error; } static ssize_t path_listxattr(const char __user *pathname, char __user *list, size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattr(pathname, list, size, 0); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { struct fd f = fdget(fd); ssize_t error = -EBADF; if (!f.file) return error; audit_file(f.file); error = listxattr(f.file->f_path.dentry, list, size); fdput(f); return error; } /* * Extended attribute REMOVE operations */ static long removexattr(struct user_namespace *mnt_userns, struct dentry *d, const char __user *name) { int error; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) error = -ERANGE; if (error < 0) return error; return vfs_removexattr(mnt_userns, d, kname); } static int path_removexattr(const char __user *pathname, const char __user *name, unsigned int lookup_flags) { struct path path; int error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(mnt_user_ns(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattr(pathname, name, LOOKUP_FOLLOW); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattr(pathname, name, 0); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { struct fd f = fdget(fd); int error = -EBADF; if (!f.file) return error; audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { error = removexattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } fdput(f); return error; } /* * Combine the results of the list() operation from every xattr_handler in the * list. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { for_each_xattr_handler(handlers, handler) { if (!handler->name || (handler->list && !handler->list(dentry))) continue; size += strlen(handler->name) + 1; } } else { char *buf = buffer; size_t len; for_each_xattr_handler(handlers, handler) { if (!handler->name || (handler->list && !handler->list(dentry))) continue; len = strlen(handler->name); if (len + 1 > buffer_size) return -ERANGE; memcpy(buf, handler->name, len + 1); buf += len + 1; buffer_size -= len + 1; } size = buf - buffer; } return size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /* * Allocate new xattr and copy in the value; but leave the name to callers. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return NULL; new_xattr = kvmalloc(len, GFP_KERNEL); if (!new_xattr) return NULL; new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } /* * xattr GET operation for in-memory/pseudo filesystems */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr; int ret = -ENODATA; spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { if (strcmp(name, xattr->name)) continue; ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } break; } spin_unlock(&xattrs->lock); return ret; } /** * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems * @xattrs: target simple_xattr list * @name: name of the extended attribute * @value: value of the xattr. If %NULL, will remove the attribute. * @size: size of the new xattr * @flags: %XATTR_{CREATE|REPLACE} * @removed_size: returns size of the removed xattr, -1 if none removed * * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; * otherwise, fails with -ENODATA. * * Returns 0 on success, -errno on failure. */ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags, ssize_t *removed_size) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; int err = 0; if (removed_size) *removed_size = -1; /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) return -ENOMEM; new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { kvfree(new_xattr); return -ENOMEM; } } spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { if (!strcmp(name, xattr->name)) { if (flags & XATTR_CREATE) { xattr = new_xattr; err = -EEXIST; } else if (new_xattr) { list_replace(&xattr->list, &new_xattr->list); if (removed_size) *removed_size = xattr->size; } else { list_del(&xattr->list); if (removed_size) *removed_size = xattr->size; } goto out; } } if (flags & XATTR_REPLACE) { xattr = new_xattr; err = -ENODATA; } else { list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } out: spin_unlock(&xattrs->lock); if (xattr) { kfree(xattr->name); kvfree(xattr); } return err; } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } static int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /* * xattr LIST operation for in-memory/pseudo filesystems */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; #ifdef CONFIG_FS_POSIX_ACL if (IS_POSIXACL(inode)) { if (inode->i_acl) { err = xattr_list_one(&buffer, &remaining_size, XATTR_NAME_POSIX_ACL_ACCESS); if (err) return err; } if (inode->i_default_acl) { err = xattr_list_one(&buffer, &remaining_size, XATTR_NAME_POSIX_ACL_DEFAULT); if (err) return err; } } #endif spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } spin_unlock(&xattrs->lock); return err ? err : size - remaining_size; } /* * Adds an extended attribute to the list */ void simple_xattr_list_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { spin_lock(&xattrs->lock); list_add(&new_xattr->list, &xattrs->head); spin_unlock(&xattrs->lock); }
2128 47 3296 3296 2081 1663 7 2791 1681 1686 1368 319 1685 3 3 827 794 68 108 826 167 237 685 827 830 3 9 2 9 911 913 908 911 910 170 170 169 169 842 55 96 1 20 2 873 217 741 40 880 1 882 881 43 6 34 1058 1058 512 500 28 28 6 17 10 396 386 10 379 10 1 2 2 4 3 4 9 90 88 910 23 912 910 910 910 912 912 913 910 956 957 957 957 957 953 10 957 950 10 956 4 2 956 957 956 2 2 1 956 956 956 956 958 958 6 956 1 958 958 956 956 6 956 955 922 133 957 958 957 956 957 607 503 958 956 1 958 955 240 27 945 956 956 1 956 956 958 1 957 958 955 1 955 957 2 958 1 1 1 1 1 1 1 1 955 1 1 940 71 2 199 773 5 1 958 880 1 955 954 957 955 28 1 893 78 79 70 3 69 2 71 5 5 73 73 73 4 4 69 69 2 44 2 3 53 65 70 71 71 70 70 71 71 4 4 8 8 3 1 5 6 218 219 24 205 205 197 16 9 2 9 4 4 4 4 4 4 4 4 4 4 82 11 11 3 28 3 67 4 2 2 2 2 198 4 4 189 194 7 1 1 1 186 176 3 7 162 2 19 177 3 178 4 135 41 173 4 175 176 176 85 86 125 2 4 116 117 2 1 116 1 1 107 10 106 92 7 10 4 4 4 95 4 91 1 3 82 9 91 4 81 1 1 4 1 2 2 96 59 52 66 47 62 1 1 61 57 3 1 11 56 1 1 1 1 1 63 77 1 2 4 69 1 66 1 1 1 38 39 64 66 2 2 1 3 163 1 5 19 133 36 5 8 150 3 154 143 33 1 2 1 141 1 140 1 13 1 2 1 2 3 3 5 168 168 1 10 11 11 10 1 7 5 2 4 3 3 1 7 13 1 8 3 1 2 1 2 11 8 19 1 24 24 24 34 34 21 34 32 34 15 19 32 885 883 119 18 873 748 746 12 6 2 12 2 5 5 4 3 1 4 2 12 15 1 1 1 2 2 1 10 10 6 4 5 1 1 7 1 1 2 2 2 2 1 2 7 1 7 10 9 7 7 3 1 7 10 11 1 10 10 10 10 9 7 9 7 6 10 6 7 7 1 2 1 1 11 11 1 14 4 9 9 8 2 9 14 14 5 14 14 14 1 14 14 1 14 14 28 1 2 13 13 2 8 1 10 3 5 17 18 1 16 3 14 1 4 1 11 2 1 4 5 4 4 4 33 17 7 33 33 5 22 10 14 33 15 5 13 33 15 11 1 6 11 33 1 17 17 33 2 24 24 24 2 9 24 24 20 20 8 4 3 3 7 1 2 3 7 3 7 7 7 7 7 25 2 39 16 16 1 3 1 10 10 25 1 1 23 23 21 23 1682 2 1 1679 1683 1422 265 26 211 52 261 261 1426 271 171 171 1247 1248 5 3 1701 907 848 298 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Routing netlink socket interface: protocol independent part. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> #include <linux/bpf.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/udp.h> #include <net/tcp.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 40 struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; struct module *owner; unsigned int flags; struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock); int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock_killable); static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { if (head && tail) { tail->next = defer_kfree_skb_list; defer_kfree_skb_list = head; } } EXPORT_SYMBOL(rtnl_kfree_skbs); void __rtnl_unlock(void) { struct sk_buff *head = defer_kfree_skb_list; defer_kfree_skb_list = NULL; mutex_unlock(&rtnl_mutex); while (head) { struct sk_buff *next = head->next; kfree_skb(head); cond_resched(); head = next; } } void rtnl_unlock(void) { /* This fellow will unlock it for us. */ netdev_run_todo(); } EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_is_locked); bool refcount_dec_and_rtnl_lock(refcount_t *r) { return refcount_dec_and_mutex_lock(r, &rtnl_mutex); } EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { int msgindex = msgtype - RTM_BASE; /* * msgindex < 0 implies someone tried to register a netlink * control code. msgindex >= RTM_NR_MSGTYPES may indicate that * the message type has not been added to linux/rtnetlink.h */ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); return msgindex; } static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { struct rtnl_link *link, *old; struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) goto unlock; /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } old = rtnl_dereference(tab[msgindex]); if (old) { link = kmemdup(old, sizeof(*old), GFP_KERNEL); if (!link) goto unlock; } else { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) goto unlock; } WARN_ON(link->owner && link->owner != owner); link->owner = owner; WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) link->dumpit = dumpit; link->flags |= flags; /* publish protocol:msgtype */ rcu_assign_pointer(tab[msgindex], link); ret = 0; if (old) kfree_rcu(old, rcu); unlock: rtnl_unlock(); return ret; } /** * rtnl_register_module - Register a rtnetlink message type * * @owner: module registering the hook (THIS_MODULE) * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Like rtnl_register, but for use by removable modules. */ int rtnl_register_module(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { return rtnl_register_internal(owner, protocol, msgtype, doit, dumpit, flags); } EXPORT_SYMBOL_GPL(rtnl_register_module); /** * rtnl_register - Register a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the * specified protocol family and message type is received. * * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { int err; err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, flags); if (err) pr_err("Unable to register rtnetlink message handler, " "protocol = %d, message type = %d\n", protocol, msgtype); } /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return -ENOENT; } link = rtnl_dereference(tab[msgindex]); rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * * Identical to calling rtnl_unregster() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return; } RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = rtnl_dereference(tab[msgindex]); if (!link) continue; rcu_assign_pointer(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); synchronize_net(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i, err; for (i = 0, handler = handlers; i < n; i++, handler++) { err = rtnl_register_internal(handler->owner, handler->protocol, handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { __rtnl_unregister_many(handlers, i); break; } } return err; } EXPORT_SYMBOL_GPL(__rtnl_register_many); void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i; for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) rtnl_unregister(handler->protocol, handler->msgtype); } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static LIST_HEAD(link_ops); static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) { const struct rtnl_link_ops *ops; list_for_each_entry(ops, &link_ops, list) { if (!strcmp(ops->kind, kind)) return ops; } return NULL; } /** * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * The caller must hold the rtnl_mutex. This function should be used * by drivers that create devices during module initialization. It * must be called before registering the devices. * * Returns 0 on success or a negative error code. */ int __rtnl_link_register(struct rtnl_link_ops *ops) { if (rtnl_link_ops_get(ops->kind)) return -EEXIST; /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); return 0; } EXPORT_SYMBOL_GPL(__rtnl_link_register); /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * Returns 0 on success or a negative error code. */ int rtnl_link_register(struct rtnl_link_ops *ops) { int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; rtnl_lock(); err = __rtnl_link_register(ops); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; LIST_HEAD(list_kill); for_each_netdev(net, dev) { if (dev->rtnl_link_ops == ops) ops->dellink(dev, &list_kill); } unregister_netdevice_many(&list_kill); } /** * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister * * The caller must hold the rtnl_mutex and guarantee net_namespace_list * integrity (hold pernet_ops_rwsem for writing to close the race * with setup_net() and cleanup_net()). */ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; for_each_net(net) { __rtnl_kill_links(net, ops); } list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { struct net *net; bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { if (net->dev_unreg_count > 0) { unregistering = true; break; } } if (!unregistering) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; size_t size = 0; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) goto out; ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); return size; } static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; size_t size; if (!ops) return 0; size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); size += rtnl_link_get_slave_info_data_size(dev); return size; } static LIST_HEAD(rtnl_af_ops); static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; ASSERT_RTNL(); list_for_each_entry(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } return NULL; } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Returns 0 on success or a negative error code. */ void rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); list_del_rcu(&ops->list); rtnl_unlock(); synchronize_rcu(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); static size_t rtnl_link_get_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } rcu_read_unlock(); return size; } static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; bool ret = false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) ret = true; rcu_read_unlock(); return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; struct nlattr *slave_data; int err; master_dev = netdev_master_upper_dev_get((struct net_device *) dev); if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); if (err < 0) goto err_cancel_slave_data; nla_nest_end(skb, slave_data); } return 0; err_cancel_slave_data: nla_nest_cancel(skb, slave_data); return err; } static int rtnl_link_info_fill(struct sk_buff *skb, const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; struct nlattr *data; int err; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) return err; } if (ops->fill_info) { data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } return 0; err_cancel_data: nla_nest_cancel(skb, data); return err; } static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *linkinfo; int err = -EMSGSIZE; linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; err = rtnl_link_info_fill(skb, dev); if (err < 0) goto err_cancel_link; err = rtnl_link_slave_info_fill(skb, dev); if (err < 0) goto err_cancel_link; nla_nest_end(skb, linkinfo); return 0; err_cancel_link: nla_nest_cancel(skb, linkinfo); out: return err; } int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { struct sock *rtnl = net->rtnl; return nlmsg_unicast(rtnl, skb, pid); } EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { struct sock *rtnl = net->rtnl; netlink_set_err(rtnl, 0, group, error); } EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct nlattr *mx; int i, valid = 0; /* nothing is dumped for dst_default_metrics, so just skip the loop */ if (metrics == dst_default_metrics.metrics) return 0; mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { if (i == RTAX_CC_ALGO - 1) { char tmp[TCP_CA_NAME_MAX], *name; name = tcp_ca_get_name_by_key(metrics[i], tmp); if (!name) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; if (!user_features) continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; } valid++; } } if (!valid) { nla_nest_cancel(skb, mx); return 0; } return nla_nest_end(skb, mx); nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { .rta_error = error, .rta_id = id, }; if (dst) { ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; ci.rta_clntref = atomic_read(&dst->__refcnt); } if (expires) { unsigned long clock; clock = jiffies_to_clock_t(abs(expires)); clock = min_t(unsigned long, clock, INT_MAX); ci.rta_expires = (expires > 0) ? clock : -clock; } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = dev->operstate; switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; case IF_OPER_TESTING: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) operstate = IF_OPER_DORMANT; break; } if (dev->operstate != operstate) { write_lock(&dev_base_lock); dev->operstate = operstate; write_unlock(&dev_base_lock); netdev_state_change(dev); } } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) { return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); } static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int flags = ifm->ifi_flags; /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; a->rx_bytes = b->rx_bytes; a->tx_bytes = b->tx_bytes; a->rx_errors = b->rx_errors; a->tx_errors = b->tx_errors; a->rx_dropped = b->rx_dropped; a->tx_dropped = b->tx_dropped; a->multicast = b->multicast; a->collisions = b->collisions; a->rx_length_errors = b->rx_length_errors; a->rx_over_errors = b->rx_over_errors; a->rx_crc_errors = b->rx_crc_errors; a->rx_frame_errors = b->rx_frame_errors; a->rx_fifo_errors = b->rx_fifo_errors; a->rx_missed_errors = b->rx_missed_errors; a->tx_aborted_errors = b->tx_aborted_errors; a->tx_carrier_errors = b->tx_carrier_errors; a->tx_fifo_errors = b->tx_fifo_errors; a->tx_heartbeat_errors = b->tx_heartbeat_errors; a->tx_window_errors = b->tx_window_errors; a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + nla_total_size(sizeof(struct ifla_vf_trust))); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { size += num_vfs * (nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_DROPPED */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } return size; } else return 0; } static size_t rtnl_port_size(const struct net_device *dev, u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ + nla_total_size(2); /* PORT_VDP_RESPONSE */ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + port_size; size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + vf_port_size * dev_num_vf(dev->dev.parent); else return port_self_size; } static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; size_t size; if (list_empty(&dev->name_node->list)) return 0; size = nla_total_size(0); list_for_each_entry(name_node, &dev->name_node->list, list) size += nla_total_size(ALTIFNAMSIZ); return size; } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); if (dev->proto_down_reason) size += nla_total_size(0) + nla_total_size(4); return size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ + nla_total_size(4) /* IFLA_WEIGHT */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *vf_ports; struct nlattr *vf_port; int vf; int err; vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) goto nla_put_failure; err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); if (err == -EMSGSIZE) goto nla_put_failure; if (err) { nla_nest_cancel(skb, vf_port); continue; } nla_nest_end(skb, vf_port); } nla_nest_end(skb, vf_ports); return 0; nla_put_failure: nla_nest_cancel(skb, vf_ports); return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *port_self; int err; port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); return (err == -EMSGSIZE) ? err : 0; } nla_nest_end(skb, port_self); return 0; } static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { int err; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); if (err) return err; if (dev_num_vf(dev->dev.parent)) { err = rtnl_vf_ports_fill(skb, dev); if (err) return err; } return 0; } static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) { char name[IFNAMSIZ]; int err; err = dev_get_phys_port_name(dev, name, sizeof(name)); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; } static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { struct netdev_phys_item_id ppid = { }; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_stats64 *sp; struct nlattr *attr; attr = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; sp = nla_data(attr); dev_get_stats(dev, sp); attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; copy_rtnl_link_stats(nla_data(attr), sp); return 0; } static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, struct nlattr *vfinfo, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; struct ifla_vf_guid node_guid; struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver * didn't report anything. */ ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; /* VLAN Protocol by default is 802.1Q */ ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); memset(&node_guid, 0, sizeof(node_guid)); memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = vf_trust.vf = node_guid.vf = port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; vf_vlan_info.qos = ivi.qos; vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), &node_guid) || nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), &port_guid)) goto nla_put_vf_failure; } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), &vf_vlan_info)) { nla_nest_cancel(skb, vfvlanlist); goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, vf_stats.tx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, vf_stats.multicast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } nla_nest_end(skb, vfstats); } nla_nest_end(skb, vf); return 0; nla_put_vf_failure: nla_nest_cancel(skb, vf); nla_put_vfinfo_failure: nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { struct nlattr *vfinfo; int i, num_vfs; if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) return 0; num_vfs = dev_num_vf(dev->dev.parent); if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) return -EMSGSIZE; if (!dev->netdev_ops->ndo_get_vf_config) return 0; vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) return -EMSGSIZE; } nla_nest_end(skb, vfinfo); return 0; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); map.mem_start = dev->mem_start; map.mem_end = dev->mem_end; map.base_addr = dev->base_addr; map.irq = dev->irq; map.dma = dev->dma; map.port = dev->if_port; if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; } static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; ASSERT_RTNL(); generic_xdp_prog = rtnl_dereference(dev->xdp_prog); if (!generic_xdp_prog) return 0; return generic_xdp_prog->aux->id; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, u32 (*get_prog_id)(struct net_device *dev)) { u32 curr_id; int err; curr_id = get_prog_id(dev); if (!curr_id) return 0; *prog_id = curr_id; err = nla_put_u32(skb, attr, curr_id); if (err) return err; if (*mode != XDP_ATTACHED_NONE) *mode = XDP_ATTACHED_MULTI; else *mode = tgt_mode; return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; u32 prog_id; int err; u8 mode; xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; prog_id = 0; mode = XDP_ATTACHED_NONE; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; } nla_nest_end(skb, xdp); return 0; err_cancel: nla_nest_cancel(skb, xdp); return err; } static u32 rtnl_get_event(unsigned long event) { u32 rtnl_event_type = IFLA_EVENT_NONE; switch (event) { case NETDEV_REBOOT: rtnl_event_type = IFLA_EVENT_REBOOT; break; case NETDEV_FEAT_CHANGE: rtnl_event_type = IFLA_EVENT_FEATURES; break; case NETDEV_BONDING_FAILOVER: rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; break; case NETDEV_NOTIFY_PEERS: rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; break; case NETDEV_RESEND_IGMP: rtnl_event_type = IFLA_EVENT_IGMP_RESEND; break; case NETDEV_CHANGEINFODATA: rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; break; default: break; } return rtnl_event_type; } static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) { const struct net_device *upper_dev; int ret = 0; rcu_read_lock(); upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); rcu_read_unlock(); return ret; } static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { int ifindex = dev_get_iflink(dev); if (force || dev->ifindex != ifindex) return nla_put_u32(skb, IFLA_LINK, ifindex); return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev) { char buf[IFALIASZ]; int ret; ret = dev_get_alias(dev, buf, sizeof(buf)); return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; } static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net, gfp_t gfp) { bool put_iflink = false; if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; put_iflink = true; } } return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; if (!af_ops->fill_link_af) continue; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there * was no data to be dumped. This is not an error, it * means we should trim the attribute header and * continue. */ if (err == -ENODATA) nla_nest_cancel(skb, af); else if (err < 0) return -EMSGSIZE; nla_nest_end(skb, af); } nla_nest_end(skb, af_spec); return 0; } static int rtnl_fill_alt_ifnames(struct sk_buff *skb, const struct net_device *dev) { struct netdev_name_node *name_node; int count = 0; list_for_each_entry(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; } return count; } static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *prop_list; int ret; prop_list = nla_nest_start(skb, IFLA_PROP_LIST); if (!prop_list) return -EMSGSIZE; ret = rtnl_fill_alt_ifnames(skb, dev); if (ret <= 0) goto nest_cancel; nla_nest_end(skb, prop_list); return 0; nest_cancel: nla_nest_cancel(skb, prop_list); return ret; } static int rtnl_fill_proto_down(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *pr; u32 preason; if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; preason = dev->proto_down_reason; if (!preason) return 0; pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); if (!pr) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { nla_nest_cancel(skb, pr); goto nla_put_failure; } nla_nest_end(skb, pr); return 0; nla_put_failure: return -EMSGSIZE; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) goto nla_put_failure; } if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) goto nla_put_failure; if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; if (new_ifindex && nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; rcu_read_unlock(); if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure; if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, dev_name(dev->dev.parent))) goto nla_put_failure; if (dev->dev.parent && dev->dev.parent->bus && nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, dev->dev.parent->bus->name)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to * allow 0-length string (needed to remove an alias). */ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, /* Unused, but we need to keep it here since user space could * fill it. It's also broken with regard to NLA_BINARY use in * combination with structs. */ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) { const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } return ops; } static bool link_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need * another invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool link_kind_filtered(const struct net_device *dev, const struct rtnl_link_ops *kind_ops) { if (kind_ops && dev->rtnl_link_ops != kind_ops) return true; return false; } static bool link_dump_filtered(struct net_device *dev, int master_idx, const struct rtnl_link_ops *kind_ops) { if (link_master_filtered(dev, master_idx) || link_kind_filtered(dev, kind_ops)) return true; return false; } /** * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. * @sk: netlink socket * @netnsid: network namespace identifier * * Returns the network namespace identified by netnsid on success or an error * pointer on failure. */ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } return net; } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, bool strict_check, struct nlattr **tb, struct netlink_ext_ack *extack) { int hdrlen; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); return -EINVAL; } if (ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); return -EINVAL; } return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); } /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's * what iproute2 < v3.9.0 used. * We can detect the old iproute2. Even including the IFLA_EXT_MASK * attribute, its netlink message is shorter than struct ifinfomsg. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; int h, s_h; int idx = 0, s_idx; struct net_device *dev; struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; const struct rtnl_link_ops *kind_ops = NULL; unsigned int flags = NLM_F_MULTI; int master_idx = 0; int netnsid = -1; int err, i; s_h = cb->args[0]; s_idx = cb->args[1]; err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) return err; goto walk_entries; } for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; /* new attributes should only be added with strict checking */ switch (i) { case IFLA_TARGET_NETNSID: netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(tgt_net); } break; case IFLA_EXT_MASK: ext_filter_mask = nla_get_u32(tb[i]); break; case IFLA_MASTER: master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: kind_ops = linkinfo_to_kind_ops(tb[i]); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); return -EINVAL; } } } if (master_idx || kind_ops) flags |= NLM_F_DUMP_FILTERED; walk_entries: for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (link_dump_filtered(dev, master_idx, kind_ops)) goto cont; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { if (likely(skb->len)) goto out; goto out_err; } cont: idx++; } } out: err = skb->len; out_err: cb->args[1] = idx; cb->args[0] = h; cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr) { const struct ifinfomsg *ifmp; const struct nlattr *attrs; size_t len; ifmp = nla_data(nla_peer); attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); len = nla_len(nla_peer) - sizeof(struct ifinfomsg); if (ifmp->ifi_index < 0) { NL_SET_ERR_MSG_ATTR(exterr, nla_peer, "ifindex can't be negative"); return -EINVAL; } return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; /* Examine the link attributes and figure out which * network namespace we are talking about. */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); else net = get_net(src_net); return net; } EXPORT_SYMBOL(rtnl_link_get_net); /* Figure out which network namespace we are talking about by * examining the link attributes in the following order: * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) { struct net *net; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); return net; } static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) { struct net *net; net = rtnl_link_get_net_by_nlattr(src_net, tb); if (IS_ERR(net)) return net; if (!netlink_ns_capable(skb, net->user_ns, cap)) { put_net(net); return ERR_PTR(-EPERM); } return net; } /* Verify that rtnetlink requests do not pass additional properties * potentially referring to different network namespaces. */ static int rtnl_ensure_unique_netns(struct nlattr *tb[], struct netlink_ext_ack *extack, bool netns_id_only) { if (netns_id_only) { if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) return 0; NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); return -EOPNOTSUPP; } if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; invalid_attr: NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); return -EINVAL; } static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (dev) { if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) return -EINVAL; if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; af_ops = rtnl_af_lookup(nla_type(af)); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) return -EOPNOTSUPP; if (af_ops->validate_link_af) { err = af_ops->validate_link_af(dev, af, extack); if (err < 0) return err; } } } return 0; } static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { const struct net_device_ops *ops = dev->netdev_ops; return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); } static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { if (dev->type != ARPHRD_INFINIBAND) return -EOPNOTSUPP; return handle_infiniband_guid(dev, ivt, guid_type); } static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; int err = -EINVAL; if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); if (ivm->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); if (err < 0) return err; } if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); if (ivv->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, ivv->qos, htons(ETH_P_8021Q)); if (err < 0) return err; } if (tb[IFLA_VF_VLAN_LIST]) { struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; struct nlattr *attr; int rem, len = 0; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_vlan) return err; nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) return -EOPNOTSUPP; ivvl[len] = nla_data(attr); len++; } if (len == 0) return -EINVAL; if (ivvl[0]->vf >= INT_MAX) return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); if (err < 0) return err; err = -EOPNOTSUPP; if (ops->ndo_set_vf_rate) err = ops->ndo_set_vf_rate(dev, ivt->vf, ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_rate) err = ops->ndo_set_vf_rate(dev, ivt->vf, ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); if (ivs->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, ivs->setting); if (err < 0) return err; } if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); if (ivl->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, ivl->link_state); if (err < 0) return err; } if (tb[IFLA_VF_RSS_QUERY_EN]) { struct ifla_vf_rss_query_en *ivrssq_en; err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); if (ivrssq_en->vf >= INT_MAX) return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); if (err < 0) return err; } if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); if (err < 0) return err; } if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); } return err; } static int do_set_master(struct net_device *dev, int ifindex, struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; } else { return -EOPNOTSUPP; } } if (ifindex) { upper_dev = __dev_get_by_index(dev_net(dev), ifindex); if (!upper_dev) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { return -EOPNOTSUPP; } } return 0; } static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, }; static int do_set_proto_down(struct net_device *dev, struct nlattr *nl_proto_down, struct nlattr *nl_proto_down_reason, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; const struct net_device_ops *ops = dev->netdev_ops; unsigned long mask = 0; u32 value; bool proto_down; int err; if (!ops->ndo_change_proto_down) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } if (nl_proto_down_reason) { err = nla_parse_nested_deprecated(pdreason, IFLA_PROTO_DOWN_REASON_MAX, nl_proto_down_reason, ifla_proto_down_reason_policy, NULL); if (err < 0) return err; if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); return -EINVAL; } value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); dev_change_proto_down_reason(dev, mask, value); } if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } err = dev_change_proto_down(dev, proto_down); if (err) return err; } return 0; } #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { const struct net_device_ops *ops = dev->netdev_ops; char ifname[IFNAMSIZ]; int err; err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { const char *pat = ifname[0] ? ifname : NULL; struct net *net; int new_ifindex; net = rtnl_link_get_net_capable(skb, dev_net(dev), tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } if (tb[IFLA_NEW_IFINDEX]) new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); else new_ifindex = 0; err = __dev_change_net_namespace(dev, net, pat, new_ifindex); put_net(net); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; if (!ops->ndo_set_config) { err = -EOPNOTSUPP; goto errout; } if (!netif_device_present(dev)) { err = -ENODEV; goto errout; } u_map = nla_data(tb[IFLA_MAP]); k_map.mem_start = (unsigned long) u_map->mem_start; k_map.mem_end = (unsigned long) u_map->mem_end; k_map.base_addr = (unsigned short) u_map->base_addr; k_map.irq = (unsigned char) u_map->irq; k_map.dma = (unsigned char) u_map->dma; k_map.port = (unsigned char) u_map->port; err = ops->ndo_set_config(dev, &k_map); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { struct sockaddr *sa; int len; len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; goto errout; } sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); err = dev_set_mac_address_user(dev, sa, extack); kfree(sa); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } /* * Interface selected by interface index but interface * name provided implies that a name change has been * requested. */ if (ifm->ifi_index > 0 && ifname[0]) { err = dev_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (ifm->ifi_flags || ifm->ifi_change) { err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); if (err < 0) goto errout; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); err = dev_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); if (max_size > GSO_MAX_SIZE) { err = -EINVAL; goto errout; } if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); if (max_segs > GSO_MAX_SEGS) { err = -EINVAL; goto errout; } if (dev->gso_max_segs ^ max_segs) { dev->gso_max_segs = max_segs; status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = value; write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { if (nla_type(attr) != IFLA_VF_INFO || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, attr, ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_VF_PORTS]) { struct nlattr *port[IFLA_PORT_MAX+1]; struct nlattr *attr; int vf; int rem; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_port) goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { if (nla_type(attr) != IFLA_VF_PORT || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, attr, ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { err = -EOPNOTSUPP; goto errout; } vf = nla_get_u32(port[IFLA_PORT_VF]); err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], ifla_port_policy, NULL); if (err < 0) goto errout; err = -EOPNOTSUPP; if (ops->ndo_set_vf_port) err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af, extack); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; u32 xdp_flags = 0; err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy, NULL); if (err < 0) goto errout; if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } if (xdp[IFLA_XDP_FLAGS]) { xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); if (xdp_flags & ~XDP_FLAGS_MASK) { err = -EINVAL; goto errout; } if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { err = -EINVAL; goto errout; } } if (xdp[IFLA_XDP_FD]) { int expected_fd = -1; if (xdp_flags & XDP_FLAGS_REPLACE) { if (!xdp[IFLA_XDP_EXPECTED_FD]) { err = -EINVAL; goto errout; } expected_fd = nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); } err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), expected_fd, xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } } errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) netdev_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } return err; } static struct net_device *rtnl_dev_get(struct net *net, struct nlattr *tb[]) { char ifname[ALTIFNAMSIZ]; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else if (tb[IFLA_ALT_IFNAME]) nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); else return NULL; return __dev_get_by_name(net, ifname); } static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; int err; struct nlattr *tb[IFLA_MAX+1]; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) goto errout; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) goto errout; err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else goto errout; if (dev == NULL) { err = -ENODEV; goto errout; } err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; } static int rtnl_group_dellink(const struct net *net, int group) { struct net_device *dev, *aux; LIST_HEAD(list_kill); bool found = false; if (!group) return -EPERM; for_each_netdev(net, dev) { if (dev->group == group) { const struct rtnl_link_ops *ops; found = true; ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; } } if (!found) return -ENODEV; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { const struct rtnl_link_ops *ops; ops = dev->rtnl_link_ops; ops->dellink(dev, &list_kill); } } unregister_netdevice_many(&list_kill); return 0; } int rtnl_delete_link(struct net_device *dev) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return 0; } EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else goto out; if (!dev) { if (tb[IFLA_IFNAME] || ifm->ifi_index > 0) err = -ENODEV; goto out; } err = rtnl_delete_link(dev); out: if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int old_flags; int err; old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), NULL); if (err < 0) return err; } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; __dev_notify_flags(dev, old_flags, ~0U); } return 0; } EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); if (tb[IFLA_NUM_RX_QUEUES]) num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); if (num_tx_queues < 1 || num_tx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); } if (num_rx_queues < 1 || num_rx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); } if (ops->alloc) { dev = ops->alloc(tb, ifname, name_assign_type, num_tx_queues, num_rx_queues); if (IS_ERR(dev)) return dev; } else { dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); } if (!dev) return ERR_PTR(-ENOMEM); err = validate_linkmsg(dev, tb, extack); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); err = dev_validate_mtu(dev, mtu, extack); if (err) { free_netdev(dev); return ERR_PTR(err); } dev->mtu = mtu; } if (tb[IFLA_ADDRESS]) { memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } EXPORT_SYMBOL(rtnl_create_link); static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, int group, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = do_setlink(skb, dev, ifm, extack, tb, 0); if (err < 0) return err; } } return 0; } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attr, struct netlink_ext_ack *extack) { struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; const struct rtnl_link_ops *m_ops; struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; struct nlattr *tb[IFLA_MAX + 1]; struct net *dest_net, *link_net; struct nlattr **slave_data; char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr **data; bool link_specified; int err; #ifdef CONFIG_MODULES replay: #endif err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(net, tb); } else { link_specified = false; dev = NULL; } master_dev = NULL; m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) m_ops = master_dev->rtnl_link_ops; } err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; if (tb[IFLA_LINKINFO]) { err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); if (err < 0) return err; } else memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; ops = NULL; } data = NULL; if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) return -EINVAL; if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested_deprecated(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (err < 0) return err; data = attr; } if (ops->validate) { err = ops->validate(tb, data, extack); if (err < 0) return err; } } slave_data = NULL; if (m_ops) { if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) return -EINVAL; if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { err = nla_parse_nested_deprecated(slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; slave_data = slave_attr; } } if (dev) { int status = 0; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) return -EOPNOTSUPP; err = ops->changelink(dev, tb, data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { if (!m_ops || !m_ops->slave_changelink) return -EOPNOTSUPP; err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } return do_setlink(skb, dev, ifm, extack, tb, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, * or it's for a group */ if (link_specified) return -ENODEV; if (tb[IFLA_GROUP]) return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); return -ENODEV; } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { #ifdef CONFIG_MODULES if (kind[0]) { __rtnl_unlock(); request_module("rtnl-link-%s", kind); rtnl_lock(); ops = rtnl_link_ops_get(kind); if (ops) goto replay; } #endif NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (tb[IFLA_IFNAME]) { nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); } else { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); name_assign_type = NET_NAME_ENUM; } dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); link_net = get_net_ns_by_id(dest_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); err = -EINVAL; goto out; } err = -EPERM; if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) goto out; } else { link_net = NULL; } dev = rtnl_create_link(link_net ? : dest_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; } dev->ifindex = ifm->ifi_index; if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); else err = register_netdevice(dev); if (err < 0) { free_netdev(dev); goto out; } err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; if (link_net) { err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } out: if (link_net) put_net(link_net); put_net(dest_net); return err; out_unregister: if (ops->newlink) { LIST_HEAD(list_kill); ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); } else { unregister_netdevice(dev); } goto out; } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr **attr; int ret; attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; ret = __rtnl_newlink(skb, nlh, attr, extack); kfree(attr); return ret; } static int rtnl_valid_getlink_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; for (i = 0; i <= IFLA_MAX; i++) { if (!tb[i]) continue; switch (i) { case IFLA_IFNAME: case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); return -EINVAL; } } return 0; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; int netnsid = -1; int err; u32 ext_filter_mask = 0; err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else goto out; err = -ENODEV; if (dev == NULL) goto out; err = -ENOBUFS; nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); if (nskb == NULL) goto out; err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); out: if (netnsid >= 0) put_net(tgt_net); return err; } static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; if (cmd == RTM_NEWLINKPROP) { size = rtnl_prop_list_size(dev); size += nla_total_size(ALTIFNAMSIZ); if (size >= U16_MAX) { NL_SET_ERR_MSG(extack, "effective property list too long"); return -EINVAL; } } alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; if (cmd == RTM_NEWLINKPROP) { err = netdev_name_node_alt_create(dev, alt_ifname); if (!err) alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); } else { WARN_ON_ONCE(1); err = -EINVAL; } kfree(alt_ifname); if (!err) *changed = true; return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; struct ifinfomsg *ifm; bool changed = false; struct nlattr *attr; int err, rem; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else return -EINVAL; if (!dev) return -ENODEV; if (!tb[IFLA_PROP_LIST]) return 0; nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { switch (nla_type(attr)) { case IFLA_ALT_IFNAME: err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); if (err) return err; break; } } if (changed) netdev_state_change(dev); return 0; } static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); } static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; struct net_device *dev; int hdrlen; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } if (!ext_filter_mask) return NLMSG_GOODSIZE; /* * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { min_ifinfo_dump_size = max(min_ifinfo_dump_size, if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; int type = cb->nlh->nlmsg_type - RTM_BASE; int ret = 0; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; if (type < 0 || type >= RTM_NR_MSGTYPES) continue; tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); if (!tab) continue; link = rcu_dereference_rtnl(tab[type]); if (!link) continue; dumpit = link->dumpit; if (!dumpit) continue; if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } ret = dumpit(skb, cb); if (ret) break; } cb->family = idx; return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; size_t if_info_size; skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags); if (skb == NULL) goto errout; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) { struct net *net = dev_net(dev); rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, new_ifindex); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL, 0); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, new_nsid, new_ifindex); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); /* Only return duplicate errors if NLM_F_EXCL is set */ if (err == -EEXIST && !(flags & NLM_F_EXCL)) err = 0; return err; } EXPORT_SYMBOL(ndo_dflt_fdb_add); static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } *p_vid = vid; return 0; } static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; u16 vid; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, extack); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } /* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_del); static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; __u8 *addr; int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { if (dev->netdev_ops->ndo_fdb_del) err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr, vid); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } static int nlmsg_populate_fdb(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int *idx, struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha; int err; u32 portid, seq; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { if (*idx < cb->args[2]) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: *idx += 1; } return 0; } /** * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @skb: socket buffer to store message in * @cb: netlink callback * @dev: netdevice * @filter_dev: ignored * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. */ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { int err; if (dev->type != ARPHRD_ETHER) return -EINVAL; netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, extack); if (err < 0) return err; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_IFINDEX: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); return -EINVAL; } *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); break; case NDA_MASTER: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); return -EINVAL; } *br_idx = nla_get_u32(tb[NDA_MASTER]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); return -EINVAL; } } return 0; } static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. * So, check for ndmsg with an optional u32 attribute (not used here). * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. */ if (nlmsg_len(nlh) != sizeof(struct ndmsg) && (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { struct ifinfomsg *ifm; err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) *br_idx = nla_get_u32(tb[IFLA_MASTER]); } ifm = nlmsg_data(nlh); *brport_idx = ifm->ifi_index; } return 0; } static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; struct net_device *br_dev = NULL; const struct net_device_ops *ops = NULL; const struct net_device_ops *cops = NULL; struct net *net = sock_net(skb->sk); struct hlist_head *head; int brport_idx = 0; int br_idx = 0; int h, s_h; int idx = 0, s_idx; int err = 0; int fidx = 0; if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, cb->extack); else err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, cb->extack); if (err < 0) return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) return -ENODEV; ops = br_dev->netdev_ops; } s_h = cb->args[0]; s_idx = cb->args[1]; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (brport_idx && (dev->ifindex != brport_idx)) continue; if (!br_idx) { /* user did not specify a specific bridge */ if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && !(dev->priv_flags & IFF_EBRIDGE)) continue; cops = ops; } if (idx < s_idx) goto cont; if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, &fidx); if (err == -EMSGSIZE) goto out; } } if (dev->netdev_ops->ndo_fdb_dump) err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, &fidx); else err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); if (err == -EMSGSIZE) goto out; cops = NULL; /* reset fdb offset to 0 for rest of the interfaces */ cb->args[2] = 0; fidx = 0; cont: idx++; } } out: cb->args[0] = h; cb->args[1] = idx; cb->args[2] = fidx; return skb->len; } static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct nlattr **tb, u8 *ndm_flags, int *br_idx, int *brport_idx, u8 **addr, u16 *vid, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); return -EINVAL; } if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_MASTER: *br_idx = nla_get_u32(tb[i]); break; case NDA_LLADDR: if (nla_len(tb[i]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); return -EINVAL; } *addr = nla_data(tb[i]); break; case NDA_VLAN: err = fdb_vid_parse(tb[i], vid, extack); if (err) return err; break; case NDA_VNI: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); return -EINVAL; } } return 0; } static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net_device *dev = NULL, *br_dev = NULL; const struct net_device_ops *ops = NULL; struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct sk_buff *skb; int brport_idx = 0; u8 ndm_flags = 0; int br_idx = 0; u8 *addr = NULL; u16 vid = 0; int err; err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, &brport_idx, &addr, &vid, extack); if (err < 0) return err; if (!addr) { NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); return -EINVAL; } if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (br_idx) { if (dev) { NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); return -EINVAL; } br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) { NL_SET_ERR_MSG(extack, "Invalid master ifindex"); return -EINVAL; } ops = br_dev->netdev_ops; } if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } br_dev = netdev_master_upper_dev_get(dev); if (!br_dev) { NL_SET_ERR_MSG(extack, "Master of device not found"); return -EINVAL; } ops = br_dev->netdev_ops; } else { if (!(ndm_flags & NTF_SELF)) { NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); return -EINVAL; } ops = dev->netdev_ops; } } if (!br_dev && !dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -ENODEV; } if (!ops || !ops->ndo_fdb_get) { NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); return -EOPNOTSUPP; } skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (br_dev) dev = br_dev; err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); if (err) goto out; return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: kfree_skb(skb); return err; } static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { if (mask & flag) return nla_put_u8(skb, attrnum, !!(flags & flag)); return 0; } int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_BRIDGE; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (br_dev && nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } if (mode != BRIDGE_MODE_UNDEF) { if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } if (vlan_fill) { err = vlan_fill(skb, dev, filter_mask); if (err) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protinfo) goto nla_put_failure; if (brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING, BR_LEARNING) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROXYARP, BR_PROXYARP) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { nla_nest_cancel(skb, protinfo); goto nla_put_failure; } nla_nest_end(skb, protinfo); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return err ? err : -EMSGSIZE; } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, bool strict_check, u32 *filter_mask, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err, i; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } if (err < 0) return err; /* new attributes should only be added with strict checking */ for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case IFLA_EXT_MASK: *filter_mask = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); return -EINVAL; } } } return 0; } static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, cb->extack); if (err < 0 && cb->strict_check) return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = ops->ndo_bridge_getlink(skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } } err = skb->len; out_err: rcu_read_unlock(); cb->args[0] = idx; return err; } static inline size_t bridge_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ } static int rtnl_bridge_notify(struct net_device *dev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; if (!dev->netdev_ops->ndo_bridge_getlink) return 0; skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); if (!skb) { err = -ENOMEM; goto errout; } err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; /* Notification info is only filled for bridge ports, not the bridge * device itself. Therefore, a zero notification length is valid and * should not result in an error. */ if (!skb->len) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); if (err) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; br_flags_attr = attr; flags = nla_get_u16(attr); } if (nla_type(attr) == IFLA_BRIDGE_MODE) { if (nla_len(attr) < sizeof(u16)) return -EINVAL; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (br_flags_attr) memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); break; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); out: return err; } static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) { return (mask & IFLA_STATS_FILTER_BIT(attrid)) && (!idxattr || idxattr == attrid); } #define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) static int rtnl_get_offload_stats_attr_size(int attr_id) { switch (attr_id) { case IFLA_OFFLOAD_XSTATS_CPU_HIT: return sizeof(struct rtnl_link_stats64); } return 0; } static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, int *prividx) { struct nlattr *attr = NULL; int attr_id, size; void *attr_data; int err; if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats)) return -ENODATA; for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { if (attr_id < *prividx) continue; size = rtnl_get_offload_stats_attr_size(attr_id); if (!size) continue; if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; attr = nla_reserve_64bit(skb, attr_id, size, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) goto nla_put_failure; attr_data = nla_data(attr); memset(attr_data, 0, size); err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); if (err) goto get_offload_stats_failure; } if (!attr) return -ENODATA; *prividx = 0; return 0; nla_put_failure: err = -EMSGSIZE; get_offload_stats_failure: *prividx = attr_id; return err; } static int rtnl_get_offload_stats_size(const struct net_device *dev) { int nla_size = 0; int attr_id; int size; if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats)) return 0; for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; size = rtnl_get_offload_stats_attr_size(attr_id); nla_size += nla_total_size_64bit(size); } if (nla_size != 0) nla_size += nla_total_size(0); return nla_size; } static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, int *idxattr, int *prividx) { struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; int err; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); if (!nlh) return -EMSGSIZE; ifsm = nlmsg_data(nlh); ifsm->family = PF_UNSPEC; ifsm->pad1 = 0; ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { struct rtnl_link_stats64 *sp; attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); if (!attr) goto nla_put_failure; sp = nla_data(attr); dev_get_stats(dev, sp); } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); if (!attr) goto nla_put_failure; err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, *idxattr)) { const struct rtnl_link_ops *ops = NULL; const struct net_device *master; master = netdev_master_upper_dev_get(dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); if (!attr) goto nla_put_failure; err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) goto nla_put_failure; err = rtnl_get_offload_stats(skb, dev, prividx); if (err == -ENODATA) nla_nest_cancel(skb, attr); else nla_nest_end(skb, attr); if (err && err != -ENODATA) goto nla_put_failure; *idxattr = 0; } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { struct rtnl_af_ops *af_ops; *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); if (!attr) goto nla_put_failure; rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; int err; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); if (err == -ENODATA) { nla_nest_cancel(skb, af); } else if (err < 0) { rcu_read_unlock(); goto nla_put_failure; } nla_nest_end(skb, af); } } rcu_read_unlock(); nla_nest_end(skb, attr); *idxattr = 0; } nlmsg_end(skb, nlh); return 0; nla_put_failure: /* not a multi message or no progress mean a real error */ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) nlmsg_cancel(skb, nlh); else nlmsg_end(skb, nlh); return -EMSGSIZE; } static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { struct net_device *_dev = (struct net_device *)dev; const struct rtnl_link_ops *ops = NULL; const struct net_device *master; /* netdev_master_upper_dev_get can't take const */ master = netdev_master_upper_dev_get(_dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->get_linkxstats_size) { int attr = IFLA_STATS_LINK_XSTATS_SLAVE; size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS_SLAVE */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) size += rtnl_get_offload_stats_size(dev); if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); /* for AF_* */ size += nla_total_size(0); } } rcu_read_unlock(); } return size; } static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { struct if_stats_msg *ifsm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } if (!strict_check) return 0; ifsm = nlmsg_data(nlh); /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); return -EINVAL; } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; } return 0; } static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; u32 filter_mask; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; filter_mask = ifsm->filter_mask; if (!filter_mask) return -EINVAL; nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, filter_mask, &idxattr, &prividx); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); } return err; } static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct hlist_head *head; struct net_device *dev; u32 filter_mask = 0; int idx = 0; s_h = cb->args[0]; s_idx = cb->args[1]; s_idxattr = cb->args[2]; s_prividx = cb->args[3]; cb->seq = net->dev_base_seq; err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); if (err) return err; ifsm = nlmsg_data(cb->nlh); filter_mask = ifsm->filter_mask; if (!filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (idx < s_idx) goto cont; err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, filter_mask, &s_idxattr, &s_prividx); /* If we ran out of room on the first message, * we're in trouble */ WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); if (err < 0) goto out; s_prividx = 0; s_idxattr = 0; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } out: cb->args[3] = s_prividx; cb->args[2] = s_idxattr; cb->args[1] = idx; cb->args[0] = h; return skb->len; } /* Process one rtnetlink message. */ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtnl_link *link; enum rtnl_kinds kind; struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; int family; int type; type = nlh->nlmsg_type; if (type > RTM_MAX) return -EOPNOTSUPP; type -= RTM_BASE; /* All the messages must have at least 1 byte length */ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; kind = type&3; if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; rcu_read_lock(); if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct sock *rtnl; rtnl_dumpit_func dumpit; u32 min_dump_alloc = 0; link = rtnl_get_link(family, type); if (!link || !link->dumpit) { family = PF_UNSPEC; link = rtnl_get_link(family, type); if (!link || !link->dumpit) goto err_unlock; } owner = link->owner; dumpit = link->dumpit; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); err = 0; /* need to do this before rcu_read_unlock() */ if (!try_module_get(owner)) err = -EPROTONOSUPPORT; rcu_read_unlock(); rtnl = net->rtnl; if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ module_put(owner); } return err; } link = rtnl_get_link(family, type); if (!link || !link->doit) { family = PF_UNSPEC; link = rtnl_get_link(PF_UNSPEC, type); if (!link || !link->doit) goto out_unlock; } owner = link->owner; if (!try_module_get(owner)) { err = -EPROTONOSUPPORT; goto out_unlock; } flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); module_put(owner); return err; } rcu_read_unlock(); rtnl_lock(); link = rtnl_get_link(family, type); if (link && link->doit) err = link->doit(skb, nlh, extack); rtnl_unlock(); module_put(owner); return err; out_unlock: rcu_read_unlock(); return err; err_unlock: rcu_read_unlock(); return -EOPNOTSUPP; } static void rtnetlink_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &rtnetlink_rcv_msg); } static int rtnetlink_bind(struct net *net, int group) { switch (group) { case RTNLGRP_IPV4_MROUTE_R: case RTNLGRP_IPV6_MROUTE_R: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; break; } return 0; } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REBOOT: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: case NETDEV_POST_TYPE_CHANGE: case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), GFP_KERNEL, NULL, 0); break; default: break; } return NOTIFY_DONE; } static struct notifier_block rtnetlink_dev_notifier = { .notifier_call = rtnetlink_event, }; static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; return 0; } static void __net_exit rtnetlink_net_exit(struct net *net) { netlink_kernel_release(net->rtnl); net->rtnl = NULL; } static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, }; void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo, 0); rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, 0); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CPUSET_H #define _LINUX_CPUSET_H /* * cpuset interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/sched/topology.h> #include <linux/sched/task.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/jump_label.h> #ifdef CONFIG_CPUSETS /* * Static branch rewrites can happen in an arbitrary order for a given * key. In code paths where we need to loop with read_mems_allowed_begin() and * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need * to ensure that begin() always gets rewritten before retry() in the * disabled -> enabled transition. If not, then if local irqs are disabled * around the loop, we can deadlock since retry() would always be * comparing the latest value of the mems_allowed seqcount against 0 as * begin() still would see cpusets_enabled() as false. The enabled -> disabled * transition should happen in reverse order for the same reasons (want to stop * looking at real value of mems_allowed.sequence in retry() first). */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); } static inline void cpuset_inc(void) { static_branch_inc_cpuslocked(&cpusets_pre_enable_key); static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec_cpuslocked(&cpusets_enabled_key); static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void cpuset_wait_for_hotplug(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask); static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { if (cpusets_enabled()) return __cpuset_node_allowed(node, gfp_mask); return true; } static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return __cpuset_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { if (cpusets_enabled()) return __cpuset_zone_allowed(z, gfp_mask); return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ __cpuset_memory_pressure_bump(); \ } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); extern int cpuset_slab_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { return task_spread_page(current); } static inline int cpuset_do_slab_mem_spread(void) { return task_spread_slab(current); } extern bool current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); /* * read_mems_allowed_begin is required when making decisions involving * mems_allowed such as during page allocation. mems_allowed can be updated in * parallel and depending on the new value an operation can fail potentially * causing process failure. A retry loop with read_mems_allowed_begin and * read_mems_allowed_retry prevents these artificial failures. */ static inline unsigned int read_mems_allowed_begin(void) { if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(&current->mems_allowed_seq); } /* * If this returns true, the operation that took place after * read_mems_allowed_begin may have failed artificially due to a concurrent * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. */ static inline bool read_mems_allowed_retry(unsigned int seq) { if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(&current->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { unsigned long flags; task_lock(current); local_irq_save(flags); write_seqcount_begin(&current->mems_allowed_seq); current->mems_allowed = nodemask; write_seqcount_end(&current->mems_allowed_seq); local_irq_restore(flags); task_unlock(current); } #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_wait_for_hotplug(void) { } static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; } #define cpuset_current_mems_allowed (node_states[N_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { return true; } static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return 1; } static inline void cpuset_memory_pressure_bump(void) {} static inline void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { } static inline int cpuset_mem_spread_node(void) { return 0; } static inline int cpuset_slab_spread_node(void) { return 0; } static inline int cpuset_do_page_mem_spread(void) { return 0; } static inline int cpuset_do_slab_mem_spread(void) { return 0; } static inline bool current_cpuset_is_being_rebound(void) { return false; } static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_print_current_mems_allowed(void) { } static inline void set_mems_allowed(nodemask_t nodemask) { } static inline unsigned int read_mems_allowed_begin(void) { return 0; } static inline bool read_mems_allowed_retry(unsigned int seq) { return false; } #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */
686 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fib6 #if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FIB6_H #include <linux/in6.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <linux/tracepoint.h> TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct fib6_result *res, struct fib6_table *table, const struct flowi6 *flp), TP_ARGS(net, res, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) __field( int, err ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) __field( u16, sport ) __field( u16, dport ) __field( u8, proto ) __field( u8, rt_type ) __dynamic_array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), TP_fast_assign( struct in6_addr *in6; __entry->tb_id = table->tb6_id; __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; in6 = (struct in6_addr *)__entry->src; *in6 = flp->saddr; in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; __entry->proto = flp->flowi6_proto; if (__entry->proto == IPPROTO_TCP || __entry->proto == IPPROTO_UDP) { __entry->sport = ntohs(flp->fl6_sport); __entry->dport = ntohs(flp->fl6_dport); } else { __entry->sport = 0; __entry->dport = 0; } if (res->nh && res->nh->fib_nh_dev) { __assign_str(name, res->nh->fib_nh_dev); } else { __assign_str(name, "-"); } if (res->f6i == net->ipv6.fib6_null_entry) { struct in6_addr in6_zero = {}; in6 = (struct in6_addr *)__entry->gw; *in6 = in6_zero; } else if (res->nh) { in6 = (struct in6_addr *)__entry->gw; *in6 = res->nh->fib_nh_gw6; } ), TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->tos, __entry->scope, __entry->flags, __get_str(name), __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 922 922 369 655 920 918 922 595 1378 978 836 595 89 89 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/dst.c Protocol independent destination cache. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> #include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } EXPORT_SYMBOL(dst_discard_out); const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ .refcnt = REFCOUNT_INIT(1), }; EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { dst->dev = dev; dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; dst->trailer_len = 0; #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif dst->lwtstate = NULL; atomic_set(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } EXPORT_SYMBOL(dst_init); void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; if (ops->gc && !(flags & DST_NOCOUNT) && dst_entries_get_fast(ops) > ops->gc_thresh) ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); return dst; } EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child = NULL; smp_rmb(); #ifdef CONFIG_XFRM if (dst->xfrm) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; child = xdst->child; } #endif if (dst->ops->destroy) dst->ops->destroy(dst); dev_put(dst->dev); lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) dst_release_immediate(dst); return NULL; } EXPORT_SYMBOL(dst_destroy); static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst = dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced * by dst: * 1. put the dst under blackhole interface and discard all tx/rx packets * on this route. * 2. release the net_device * This function should be called when removing routes from the fib tree * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to * make the next dst_ops->check() fail. */ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; dst->obsolete = DST_OBSOLETE_DEAD; if (dst->ops->ifdown) dst->ops->ifdown(dst, dev, true); dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } EXPORT_SYMBOL(dst_dev_put); static void dst_count_dec(struct dst_entry *dst) { if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); } void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt){ dst_count_dec(dst); call_rcu(&dst->rcu_head, dst_destroy_rcu); } } } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { if (dst) { int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt){ dst_count_dec(dst); dst_destroy(dst); } } } EXPORT_SYMBOL(dst_release_immediate); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; refcount_set(&p->refcnt, 1); memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; } else if (prev & DST_METRICS_REFCOUNTED) { if (refcount_dec_and_test(&old_p->refcnt)) kfree(old_p); } } BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); /* Caller asserts that dst_metrics_read_only(dst) is false. */ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); } EXPORT_SYMBOL(__dst_destroy_metrics_generic); struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) { return NULL; } u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } EXPORT_SYMBOL_GPL(dst_blackhole_redirect); unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst->dev->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); static struct dst_ops dst_blackhole_ops = { .family = AF_UNSPEC, .neigh_lookup = dst_blackhole_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static void __metadata_dst_init(struct metadata_dst *md_dst, enum metadata_type type, u8 optslen) { struct dst_entry *dst; dst = &md_dst->dst; dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCOUNT); memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->type = type; } struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags) { struct metadata_dst *md_dst; md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); if (!md_dst) return NULL; __metadata_dst_init(md_dst, type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc); void metadata_dst_free(struct metadata_dst *md_dst) { #ifdef CONFIG_DST_CACHE if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) { int cpu; struct metadata_dst __percpu *md_dst; md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; for_each_possible_cpu(cpu) __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { #ifdef CONFIG_DST_CACHE int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); } #endif free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 // SPDX-License-Identifier: GPL-2.0 /* * bio-integrity.c - bio data integrity extensions * * Copyright (C) 2007, 2008, 2009 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> */ #include <linux/blkdev.h> #include <linux/mempool.h> #include <linux/export.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/slab.h> #include "blk.h" static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; void blk_flush_integrity(void) { flush_workqueue(kintegrityd_wq); } static void __bio_integrity_free(struct bio_set *bs, struct bio_integrity_payload *bip) { if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); } } /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { struct bio_integrity_payload *bip; struct bio_set *bs = bio->bi_pool; unsigned inline_vecs; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); inline_vecs = BIO_INLINE_VECS; } if (unlikely(!bip)) return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { bip->bip_max_vcnt = nr_vecs; bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; } bip->bip_bio = bio; bio->bi_integrity = bip; bio->bi_opf |= REQ_INTEGRITY; return bip; err: __bio_integrity_free(bs, bip); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed * * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). */ void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update * @page: page containing integrity metadata * @len: number of bytes of integrity metadata in page * @offset: start offset within page * * Description: Attach a page containing integrity metadata to bio. */ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_vec *iv; if (bip->bip_vcnt >= bip->bip_max_vcnt) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; iv->bv_page = page; iv->bv_len = len; iv->bv_offset = offset; bip->bip_vcnt++; return len; } EXPORT_SYMBOL(bio_integrity_add_page); /** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for * @proc_iter: iterator to process * @proc_fn: Pointer to the relevant processing function */ static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); iter.data_buf = kaddr; iter.data_size = bv.bv_len; ret = proc_fn(&iter); kunmap_local(kaddr); if (ret) break; } return ret; } /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare * * Description: Checks if the bio already has an integrity payload attached. * If it does, the payload has been generated by another kernel subsystem, * and we just pass it through. Otherwise allocates integrity payload. * The bio must have data direction, target device and start sector set priot * to calling. In the WRITE case, integrity metadata will be generated using * the block device's integrity function. In the READ case, the buffer * will be prepared for DMA and a suitable end_io handler set up. */ bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); void *buf; unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; gfp_t gfp = GFP_NOIO; if (!bi) return true; if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return true; if (!bio_sectors(bio)) return true; /* Already protected? */ if (bio_integrity(bio)) return true; if (bio_data_dir(bio) == READ) { if (!bi->profile->verify_fn || !(bi->flags & BLK_INTEGRITY_VERIFY)) return true; } else { if (!bi->profile->generate_fn || !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; /* * Zero the memory allocated to not leak uninitialized kernel * memory to disk. For PI this only affects the app tag, but * for non-integrity metadata it affects the entire metadata * buffer. */ gfp |= __GFP_ZERO; } /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; } end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ((unsigned long) buf) >> PAGE_SHIFT; nr_pages = end - start; /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); goto err_end_io; } bip->bip_flags |= BIP_BLOCK_INTEGRITY; bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) bip->bip_flags |= BIP_IP_CHECKSUM; /* Map it */ offset = offset_in_page(buf); for (i = 0 ; i < nr_pages ; i++) { int ret; bytes = PAGE_SIZE - offset; if (len <= 0) break; if (bytes > len) bytes = len; ret = bio_integrity_add_page(bio, virt_to_page(buf), bytes, offset); if (ret == 0) { printk(KERN_ERR "could not attach integrity payload\n"); goto err_end_io; } if (ret < bytes) break; buf += bytes; len -= bytes; offset = 0; } /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) { bio_integrity_process(bio, &bio->bi_iter, bi->profile->generate_fn); } else { bip->bio_iter = bio->bi_iter; } return true; err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; } EXPORT_SYMBOL(bio_integrity_prep); /** * bio_integrity_verify_fn - Integrity I/O completion worker * @work: Work struct stored in bio to be verified * * Description: This workqueue function is called to complete a READ * request. The function verifies the transferred integrity metadata * and then calls the original bio end_io function. */ static void bio_integrity_verify_fn(struct work_struct *work) { struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced * during split and completion, we need to rewind iterator to * it's original position. */ bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, bi->profile->verify_fn); bio_integrity_free(bio); bio_endio(bio); } /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio * * Description: Completion for integrity I/O * * Normally I/O completion is done in interrupt context. However, * verifying I/O integrity is a time-consuming task which must be run * in process context. This function postpones completion * accordingly. */ bool __bio_integrity_endio(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) { INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); return false; } bio_integrity_free(bio); return true; } /** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update * @bytes_done: number of data bytes that have been completed * * Description: This function calculates how many integrity bytes the * number of completed data bytes correspond to and advances the * integrity vector accordingly. */ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } /** * bio_integrity_trim - Trim integrity vector * @bio: bio whose integrity vector to update * * Description: Used to trim the integrity vector in a cloned bio. */ void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } EXPORT_SYMBOL(bio_integrity_trim); /** * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { struct bio_integrity_payload *bip_src = bio_integrity(bio_src); struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); if (IS_ERR(bip)) return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); bip->bip_vcnt = bip_src->bip_vcnt; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; return 0; } EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { if (mempool_initialized(&bs->bio_integrity_pool)) return 0; if (mempool_init_slab_pool(&bs->bio_integrity_pool, pool_size, bip_slab)) return -1; if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { mempool_exit(&bs->bio_integrity_pool); return -1; } return 0; } EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { mempool_exit(&bs->bio_integrity_pool); mempool_exit(&bs->bvec_integrity_pool); } void __init bio_integrity_init(void) { /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. */ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIO_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); }
4 4 2 9 17 17 16 16 16 16 16 16 3 3 3 3 4 10 10 10 3 2 8 8 5 5 3 2 4 5 5 10 10 7 7 7 14 13 14 14 14 14 2 2 2 2 9 9 3 2 2 9 9 3 3 3 3 3 2 1 1 1 1 1 7 7 7 7 7 7 903 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "soft-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" #include "originator.h" #include "send.h" #include "translation-table.h" /** * batadv_skb_head_push() - Increase header size and move (push) head pointer * @skb: packet buffer which should be modified * @len: number of bytes to add * * Return: 0 on success or negative error number in case of failure */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; /* TODO: We must check if we can release all references to non-payload * data using __skb_header_release in our skbs to allow skb_cow_header * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. */ result = skb_cow_head(skb, len); if (result < 0) return result; skb_push(skb, len); return 0; } static int batadv_interface_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static int batadv_interface_release(struct net_device *dev) { netif_stop_queue(dev); return 0; } /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the soft interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters */ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { counters = per_cpu_ptr(bat_priv->bat_counters, cpu); sum += counters[idx]; } return sum; } static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED); stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX); stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES); return stats; } static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); ether_addr_copy(dev->dev_addr, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return 0; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } rcu_read_unlock(); return 0; } static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; bat_priv->mtu_set_by_user = new_mtu; return 0; } /** * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman * soft interface. However a dummy handler enables a user to set static * multicast listeners for instance. */ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct net_device *soft_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode = BATADV_FORW_SINGLE; struct batadv_orig_node *mcast_single_orig = NULL; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; switch (ntohs(proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, sizeof(*vhdr))) goto dropped; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; /* drop batman-in-batman packets to prevent loops */ if (proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } fallthrough; case ETH_P_BATMAN: goto dropped; } skb_set_network_header(skb, network_offset); if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if (!client_added) goto dropped; } /* Snoop address candidates from DHCPACKs for early DAT filling */ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... * * The same goes for ECTP sent at least by some Cisco Switches, * it might confuse the mesh when used with bridge loop avoidance. */ if (batadv_compare_eth(ethhdr->h_dest, stp_addr)) goto dropped; if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { do_bcast = true; goto send; } dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, chaddr); /* skb->data may have been modified by * batadv_gw_dhcp_recipient_get() */ ethhdr = eth_hdr(skb); /* if gw_mode is on, broadcast any non-DHCP message. * All the DHCP packets are going to be sent as unicast */ if (dhcp_rcp == BATADV_DHCP_NO) { do_bcast = true; goto send; } if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) dst_hint = chaddr; else if ((gw_mode == BATADV_GW_MODE_SERVER) && (dhcp_rcp == BATADV_DHCP_TO_SERVER)) /* gateways should not forward any DHCP message if * directed to a DHCP server */ goto dropped; send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, &mcast_single_orig, &mcast_is_routable); if (forw_mode == BATADV_FORW_NONE) goto dropped; if (forw_mode == BATADV_FORW_SINGLE || forw_mode == BATADV_FORW_SOME) do_bcast = false; } } batadv_skb_set_priority(skb, 0); /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto dropped; /* in case of ARP request, we do not immediately broadcasti the * packet, instead we first wait for DAT to try to retrieve the * correct ARP entry */ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) brd_delay = msecs_to_jiffies(ARP_REQ_DELAY); if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0) goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ ether_addr_copy(bcast_packet->orig, primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (mcast_single_orig) { ret = batadv_mcast_forw_send_orig(bat_priv, skb, vid, mcast_single_orig); } else if (forw_mode == BATADV_FORW_SOME) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) goto dropped; batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, vid); } if (ret != NET_XMIT_SUCCESS) goto dropped_freed; } batadv_inc_counter(bat_priv, BATADV_CNT_TX); batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len); goto end; dropped: kfree_skb(skb); dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: batadv_orig_node_put(mcast_single_orig); batadv_hardif_put(primary_if); return NETDEV_TX_OK; } /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @soft_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @soft_iface * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. * * The packet may still get dropped. This can happen when the encapsulated * ethernet frame is invalid or contains again an batman-adv packet. Also * unicast packets will be dropped directly when it was sent between two * isolated clients. */ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; int packet_type; batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; packet_type = batadv_bcast_packet->packet_type; skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); /* clean the netfilter state now that the batman-adv header has been * removed */ nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, VLAN_ETH_HLEN)) goto dropped; vhdr = skb_vlan_eth_hdr(skb); /* drop batman-in-batman packets to prevent loops */ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; fallthrough; case ETH_P_BATMAN: goto dropped; } /* skb->dev & skb->pkt_type are set here */ skb->protocol = eth_type_trans(skb, soft_iface); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ if (batadv_bla_rx(bat_priv, skb, vid, packet_type)) goto out; if (orig_node) batadv_tt_add_temporary_global_entry(bat_priv, orig_node, ethhdr->h_source, vid); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* set the mark on broadcast packets if AP isolation is ON and * the packet is coming from an "isolated" client */ if (batadv_vlan_ap_isola_get(bat_priv, vid) && batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source, vid)) { /* save bits in skb->mark not covered by the mask and * apply the mark on the rest */ skb->mark &= ~bat_priv->isolation_mark_mask; skb->mark |= bat_priv->isolation_mark; } } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid)) { goto dropped; } netif_rx(skb); goto out; dropped: kfree_skb(skb); out: return; } /** * batadv_softif_vlan_release() - release vlan from lists and queue for free * after rcu grace period * @ref: kref pointer of the vlan object */ void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; vlan = container_of(ref, struct batadv_softif_vlan, refcount); spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); kfree_rcu(vlan, rcu); } /** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { if (vlan_tmp->vid != vid) continue; if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_softif_create_vlan() - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan; spin_lock_bh(&bat_priv->softif_vlan_list_lock); vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) { spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; } vlan->bat_priv = bat_priv; vlan->vid = vid; kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); /* don't return reference to new softif_vlan */ batadv_softif_vlan_put(vlan); return 0; } /** * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object * @bat_priv: the bat priv with all the soft interface information * @vlan: the object to remove */ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vid |= BATADV_VLAN_HAS_TAG; /* if a new vlan is getting created and it already exists, it means that * it was not deleted yet. batadv_softif_vlan_get() increases the * refcount in order to revive the object. * * if it does not exist then create it. */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return batadv_softif_create_vlan(bat_priv, vid); /* add a new TT local entry. This one will be marked with the NOPURGE * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; } /** * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. batman-adv does not know how to * handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return -ENOENT; batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ batadv_softif_vlan_put(vlan); return 0; } /* batman-adv network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue * @dev: device which owns the tx queue * @txq: tx queue to modify * @_unused: always NULL */ static void batadv_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); } /** * batadv_set_lockdep_class() - Set txq and addr_list lockdep class * @dev: network device to modify */ static void batadv_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify * * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; u32 random_seqno; int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); atomic_set(&bat_priv->hop_penalty, 30); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif atomic_set(&bat_priv->fragmentation, 1); atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); atomic_set(&bat_priv->bcast_seqno, 1); atomic_set(&bat_priv->tt.vn, 0); atomic_set(&bat_priv->tt.local_changes, 0); atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif atomic_set(&bat_priv->tp_num, 0); bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; bat_priv->isolation_mark_mask = 0; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; batadv_nc_init_bat_priv(bat_priv); if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) goto free_bat_counters; } ret = batadv_mesh_init(dev); if (ret < 0) goto free_bat_counters; return 0; free_bat_counters: free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; return ret; } /** * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev); out: batadv_hardif_put(hard_iface); return ret; } /** * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface != dev) goto out; batadv_hardif_disable_interface(hard_iface); ret = 0; out: batadv_hardif_put(hard_iface); return ret; } static const struct net_device_ops batadv_netdev_ops = { .ndo_init = batadv_softif_init_late, .ndo_open = batadv_interface_open, .ndo_stop = batadv_interface_release, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, .ndo_set_mac_address = batadv_interface_set_mac_addr, .ndo_change_mtu = batadv_interface_change_mtu, .ndo_set_rx_mode = batadv_interface_set_rx_mode, .ndo_start_xmit = batadv_interface_tx, .ndo_validate_addr = eth_validate_addr, .ndo_add_slave = batadv_softif_slave_add, .ndo_del_slave = batadv_softif_slave_del, }; static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, "batman", sizeof(info->bus_info)); } /* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 * Declare each description string in struct.name[] to get fixed sized buffer * and compile time checking for strings longer than ETH_GSTRING_LEN. */ static const struct { const char name[ETH_GSTRING_LEN]; } batadv_counters_strings[] = { { "tx" }, { "tx_bytes" }, { "tx_dropped" }, { "rx" }, { "rx_bytes" }, { "forward" }, { "forward_bytes" }, { "mgmt_tx" }, { "mgmt_tx_bytes" }, { "mgmt_rx" }, { "mgmt_rx_bytes" }, { "frag_tx" }, { "frag_tx_bytes" }, { "frag_rx" }, { "frag_rx_bytes" }, { "frag_fwd" }, { "frag_fwd_bytes" }, { "tt_request_tx" }, { "tt_request_rx" }, { "tt_response_tx" }, { "tt_response_rx" }, { "tt_roam_adv_tx" }, { "tt_roam_adv_rx" }, #ifdef CONFIG_BATMAN_ADV_DAT { "dat_get_tx" }, { "dat_get_rx" }, { "dat_put_tx" }, { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif #ifdef CONFIG_BATMAN_ADV_NC { "nc_code" }, { "nc_code_bytes" }, { "nc_recode" }, { "nc_recode_bytes" }, { "nc_buffer" }, { "nc_decode" }, { "nc_decode_bytes" }, { "nc_decode_failed" }, { "nc_sniffed" }, #endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, sizeof(batadv_counters_strings)); } static void batadv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; for (i = 0; i < BATADV_CNT_NUM; i++) data[i] = batadv_sum_counter(bat_priv, i); } static int batadv_get_sset_count(struct net_device *dev, int stringset) { if (stringset == ETH_SS_STATS) return BATADV_CNT_NUM; return -EOPNOTSUPP; } static const struct ethtool_ops batadv_ethtool_ops = { .get_drvinfo = batadv_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = batadv_get_strings, .get_ethtool_stats = batadv_get_ethtool_stats, .get_sset_count = batadv_get_sset_count, }; /** * batadv_softif_free() - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove */ static void batadv_softif_free(struct net_device *dev) { batadv_mesh_free(dev); /* some scheduled RCU callbacks need the bat_priv struct to accomplish * their tasks. Wait for them all to be finished before freeing the * netdev and its private data (bat_priv) */ rcu_barrier(); } /** * batadv_softif_init_early() - early stage initialization of soft interface * @dev: registered network device to modify */ static void batadv_softif_init_early(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; /* generate random address */ eth_hw_addr_random(dev); dev->ethtool_ops = &batadv_ethtool_ops; } /** * batadv_softif_validate() - validate configuration of new batadv link * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_algo_ops *algo_ops; if (!data) return 0; if (data[IFLA_BATADV_ALGO_NAME]) { algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME])); if (!algo_ops) return -EINVAL; } return 0; } /** * batadv_softif_newlink() - pre-initialize and register new batadv link * @src_net: the applicable net namespace * @dev: network device to register * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); const char *algo_name; int err; if (data && data[IFLA_BATADV_ALGO_NAME]) { algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]); err = batadv_algo_select(bat_priv, algo_name); if (err) return -EINVAL; } return register_netdevice(dev); } /** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface * @head: list pointer */ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); } unregister_netdevice_queue(soft_iface, head); } /** * batadv_softif_is_valid() - Check whether device is a batadv soft interface * @net_dev: device which should be checked * * Return: true when net_dev is a batman-adv interface, false otherwise */ bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) return true; return false; } static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING }, }; struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), .setup = batadv_softif_init_early, .maxtype = IFLA_BATADV_MAX, .policy = batadv_ifla_policy, .validate = batadv_softif_validate, .newlink = batadv_softif_newlink, .dellink = batadv_softif_destroy_netlink, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/mm.h> #include <asm/io.h> struct scatterlist { unsigned long page_link; unsigned int offset; unsigned int length; dma_addr_t dma_address; #ifdef CONFIG_NEED_SG_DMA_LENGTH unsigned int dma_length; #endif }; /* * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. * You should only work with the number of sg entries dma_map_sg * returns, or alternatively stop on the first sg_dma_len(sg) which * is 0. */ #define sg_dma_address(sg) ((sg)->dma_address) #ifdef CONFIG_NEED_SG_DMA_LENGTH #define sg_dma_len(sg) ((sg)->dma_length) #else #define sg_dma_len(sg) ((sg)->length) #endif struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ unsigned int orig_nents; /* original size of list */ }; struct sg_append_table { struct sg_table sgt; /* The scatter list table */ struct scatterlist *prv; /* last populated sge in the table */ unsigned int total_nents; /* Total entries in the table */ }; /* * Notes on SG table design. * * We use the unsigned long page_link field in the scatterlist struct to place * the page pointer AND encode information about the sg table as well. The two * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. * * If bit 1 is set, then this sg entry is the last element in a list. * * See sg_next(). * */ #define SG_CHAIN 0x01UL #define SG_END 0x02UL /* * We overload the LSB of the page pointer to indicate whether it's * a valid sg entry, or whether it points to the start of a new scatterlist. * Those low bits are there for everyone! (thanks mason :-) */ #define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) #define sg_is_last(sg) ((sg)->page_link & SG_END) #define sg_chain_ptr(sg) \ ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END))) /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry * @page: The page * * Description: * Assign page to sg entry. Also see sg_set_page(), the most commonly used * variant. * **/ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) { unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END); /* * In order for the low bit stealing approach to work, pages * must be aligned at a 32-bit boundary as a minimum. */ BUG_ON((unsigned long) page & (SG_CHAIN | SG_END)); #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; } /** * sg_set_page - Set sg entry to point at given page * @sg: SG entry * @page: The page * @len: Length of data * @offset: Offset into page * * Description: * Use this function to set an sg entry pointing at a page, never assign * the page directly. We encode sg table information in the lower bits * of the page pointer. See sg_page() for looking up the page belonging * to an sg entry. * **/ static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { sg_assign_page(sg, page); sg->offset = offset; sg->length = len; } static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END)); } /** * sg_set_buf - Set sg entry to point at given data * @sg: SG entry * @buf: Data * @buflen: Data length * **/ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { #ifdef CONFIG_DEBUG_SG BUG_ON(!virt_addr_valid(buf)); #endif sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } /* * Loop over each sg element, following the pointer to a new list if necessary */ #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) /* * Loop over each sg element in the given sg_table object. */ #define for_each_sgtable_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i) /* * Loop over each sg element in the given *DMA mapped* sg_table object. * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses * of the each element. */ #define for_each_sgtable_dma_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) static inline void __sg_chain(struct scatterlist *chain_sg, struct scatterlist *sgl) { /* * offset and length are unused for chain entry. Clear them. */ chain_sg->offset = 0; chain_sg->length = 0; /* * Set lowest bit to indicate a link pointer, and make sure to clear * the termination bit if it happens to be set. */ chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END; } /** * sg_chain - Chain two sglists together * @prv: First scatterlist * @prv_nents: Number of entries in prv * @sgl: Second scatterlist * * Description: * Links @prv@ and @sgl@ together, to form a longer scatterlist. * **/ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { __sg_chain(&prv[prv_nents - 1], sgl); } /** * sg_mark_end - Mark the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Marks the passed in sg entry as the termination point for the sg * table. A call to sg_next() on this entry will return NULL. * **/ static inline void sg_mark_end(struct scatterlist *sg) { /* * Set termination bit, clear potential chain bit */ sg->page_link |= SG_END; sg->page_link &= ~SG_CHAIN; } /** * sg_unmark_end - Undo setting the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Removes the termination marker from the given entry of the scatterlist. * **/ static inline void sg_unmark_end(struct scatterlist *sg) { sg->page_link &= ~SG_END; } /** * sg_phys - Return physical address of an sg entry * @sg: SG entry * * Description: * This calls page_to_phys() on the page in this sg entry, and adds the * sg offset. The caller must know that it is legal to call page_to_phys() * on the sg page. * **/ static inline dma_addr_t sg_phys(struct scatterlist *sg) { return page_to_phys(sg_page(sg)) + sg->offset; } /** * sg_virt - Return virtual address of an sg entry * @sg: SG entry * * Description: * This calls page_address() on the page in this sg entry, and adds the * sg offset. The caller must know that the sg page has a valid virtual * mapping. * **/ static inline void *sg_virt(struct scatterlist *sg) { return page_address(sg_page(sg)) + sg->offset; } /** * sg_init_marker - Initialize markers in sg table * @sgl: The SG table * @nents: Number of entries in table * **/ static inline void sg_init_marker(struct scatterlist *sgl, unsigned int nents) { sg_mark_end(&sgl[nents - 1]); } int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); int sg_split(struct scatterlist *in, const int in_mapped_nents, const off_t skip, const int nb_splits, const size_t *split_sizes, struct scatterlist **out, int *out_mapped_nents, gfp_t gfp_mask); typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask); /** * sg_alloc_table_from_pages - Allocate and initialize an sg table from * an array of pages * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node. A user * may provide an offset at a start and a size of valid data in a buffer * specified by the page array. The returned sg table is released by * sg_free_table. * * Returns: * 0 on success, negative error on failure */ static inline int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask) { return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, size, UINT_MAX, gfp_mask); } #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p); struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p); void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); void sgl_free_order(struct scatterlist *sgl, int order); void sgl_free(struct scatterlist *sgl); #endif /* CONFIG_SGL_ALLOC */ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if * a list larger than this is required then chaining will be utilized. */ #define SG_MAX_SINGLE_ALLOC (PAGE_SIZE / sizeof(struct scatterlist)) /* * The maximum number of SG segments that we will put inside a * scatterlist (unless chaining is used). Should ideally fit inside a * single page, to avoid a higher order allocation. We could define this * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The * minimum value is 32 */ #define SG_CHUNK_SIZE 128 /* * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ #ifdef CONFIG_ARCH_NO_SG_CHAIN #define SG_MAX_SEGMENTS SG_CHUNK_SIZE #else #define SG_MAX_SEGMENTS 2048 #endif #ifdef CONFIG_SG_POOL void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk); int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk); #endif /* * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, you * can call sg_page_iter_page(@piter) to get the current page. * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to * the page's page offset within the sg. The iteration will stop either when a * maximum number of sg entries was reached or a terminating sg * (sg_last(sg) == true) was reached. */ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ /* these are internal states, keep away */ unsigned int __nents; /* remaining sg entries */ int __pg_advance; /* nr pages to advance at the * next step */ }; /* * sg page iterator for DMA addresses * * This is the same as sg_page_iter however you can call * sg_page_iter_dma_address(@dma_iter) to get the page's DMA * address. sg_page_iter_page() cannot be called on this iterator. */ struct sg_dma_page_iter { struct sg_page_iter base; }; bool __sg_page_iter_next(struct sg_page_iter *piter); bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); /** * sg_page_iter_page - get the current page held by the page iterator * @piter: page iterator holding the page */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { return nth_page(sg_page(piter->sg), piter->sg_pgoffset); } /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. * @dma_iter: page iterator holding the page */ static inline dma_addr_t sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { return sg_dma_address(dma_iter->base.sg) + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** * for_each_sg_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) /** * for_each_sgtable_page - iterate over all pages in the sg_table object * @sgt: sg_table object to iterate over * @piter: page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all memory pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. */ #define for_each_sgtable_page(sgt, piter, pgoffset) \ for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset) /** * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object * @sgt: sg_table object to iterate over * @dma_iter: DMA page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all DMA mapped pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE * unit. */ #define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset) /* * Mapping sg iterator * * Iterates over sg entries mapping page-by-page. On each successful * iteration, @miter->page points to the mapped page and * @miter->length bytes of data can be accessed at @miter->addr. As * long as an iteration is enclosed between start and stop, the user * is free to choose control structure and when to stop. * * @miter->consumed is set to @miter->length on each iteration. It * can be adjusted if the user can't consume all the bytes in one go. * Also, a stopped iteration can be resumed by calling next on it. * This is useful when iteration needs to release all resources and * continue later (e.g. at the next interrupt). */ #define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ #define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */ #define SG_MITER_FROM_SG (1 << 2) /* nop */ struct sg_mapping_iter { /* the following three fields can be accessed directly */ struct page *page; /* currently mapped page */ void *addr; /* pointer to the mapped area */ size_t length; /* length of the mapped area */ size_t consumed; /* number of consumed bytes */ struct sg_page_iter piter; /* page iterator */ /* these are internal states, keep away */ unsigned int __offset; /* offset within page */ unsigned int __remaining; /* remaining bytes on page */ unsigned int __flags; }; void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags); bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset); bool sg_miter_next(struct sg_mapping_iter *miter); void sg_miter_stop(struct sg_mapping_iter *miter); #endif /* _LINUX_SCATTERLIST_H */
47 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 // SPDX-License-Identifier: GPL-2.0 /* * xfrm6_policy.c: based on xfrm4_policy.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki * Split up af-specific portion * */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/l3mdev.h> static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl6.flowi6_mark = params->mark; memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); if (params->saddr) memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); fl6.flowi4_proto = params->ipproto; fl6.uli = params->uli; dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { dst_release(dst); dst = ERR_PTR(err); } return dst; } static int xfrm6_get_saddr(xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; struct inet6_dev *idev; dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; idev = ip6_dst_idev(dst); if (!idev) { dst_release(dst); return -EHOSTUNREACH; } dev = idev->dev; ipv6_dev_get_saddr(dev_net(dev), dev, &params->daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; } static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { struct rt6_info *rt = (struct rt6_info *)xdst->route; xdst->u.dst.dev = dev; dev_hold(dev); xdst->u.rt6.rt6i_idev = in6_dev_get(dev); if (!xdst->u.rt6.rt6i_idev) { dev_put(dev); return -ENODEV; } /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached); rt6_uncached_list_add(&xdst->u.rt6); atomic_inc(&dev_net(dev)->ipv6.rt6_stats->fib_rt_uncache); return 0; } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); if (xdst->u.rt6.rt6i_uncached_list) rt6_uncached_list_del(&xdst->u.rt6); if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); xfrm_dst_destroy(xdst); } static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int unregister) { struct xfrm_dst *xdst; if (!unregister) return; xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); } xfrm_dst_ifdown(dst, dev); } static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, }; static int __init xfrm6_policy_init(void) { return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6); } static void xfrm6_policy_fini(void) { xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } #ifdef CONFIG_SYSCTL static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = xfrm6_policy_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; } hdr = register_net_sysctl(net, "net/ipv6", table); if (!hdr) goto err_reg; net->ipv6.sysctl.xfrm6_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else /* CONFIG_SYSCTL */ static inline int xfrm6_net_sysctl_init(struct net *net) { return 0; } static inline void xfrm6_net_sysctl_exit(struct net *net) { } #endif static int __net_init xfrm6_net_init(struct net *net) { int ret; memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, sizeof(xfrm6_dst_ops_template)); ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); if (ret) return ret; ret = xfrm6_net_sysctl_init(net); if (ret) dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); return ret; } static void __net_exit xfrm6_net_exit(struct net *net) { xfrm6_net_sysctl_exit(net); dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); } static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; int __init xfrm6_init(void) { int ret; ret = xfrm6_policy_init(); if (ret) goto out; ret = xfrm6_state_init(); if (ret) goto out_policy; ret = xfrm6_protocol_init(); if (ret) goto out_state; ret = register_pernet_subsys(&xfrm6_net_ops); if (ret) goto out_protocol; out: return ret; out_protocol: xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; } void xfrm6_fini(void) { unregister_pernet_subsys(&xfrm6_net_ops); xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); }
15 15 2 1 1 1 3 23 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the UDP-Lite (RFC 3828) code. */ #ifndef _UDPLITE_H #define _UDPLITE_H #include <net/ip6_checksum.h> /* UDP-Lite socket options */ #define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ #define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ extern struct proto udplite_prot; extern struct udp_table udplite_table; /* * Checksum computation is all in software, hence simpler getfrag. */ static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; } /* * Checksumming routines */ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) { u16 cscov; /* In UDPv4 a zero checksum means that the transmitter generated no * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets * with a zero checksum field are illegal. */ if (uh->check == 0) { net_dbg_ratelimited("UDPLite: zeroed checksum field\n"); return 1; } cscov = ntohs(uh->len); if (cscov == 0) /* Indicates that full coverage is required. */ ; else if (cscov < 8 || cscov > skb->len) { /* * Coverage length violates RFC 3828: log and discard silently. */ net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n", cscov, skb->len); return 1; } else if (cscov < skb->len) { UDP_SKB_CB(skb)->partial_cov = 1; UDP_SKB_CB(skb)->cscov = cscov; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; skb->csum_valid = 0; } return 0; } /* Slow-path computation of checksum. Socket is locked. */ static inline __wsum udplite_csum_outgoing(struct sock *sk, struct sk_buff *skb) { const struct udp_sock *up = udp_sk(skb->sk); int cscov = up->len; __wsum csum = 0; if (up->pcflag & UDPLITE_SEND_CC) { /* * Sender has set `partial coverage' option on UDP-Lite socket. * The special case "up->pcslen == 0" signifies full coverage. */ if (up->pcslen < up->len) { if (0 < up->pcslen) cscov = up->pcslen; udp_hdr(skb)->len = htons(up->pcslen); } /* * NOTE: Causes for the error case `up->pcslen > up->len': * (i) Application error (will not be penalized). * (ii) Payload too big for send buffer: data is split * into several packets, each with its own header. * In this case (e.g. last segment), coverage may * exceed packet length. * Since packets with coverage length > packet length are * illegal, we fall back to the defaults here. */ } skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ skb_queue_walk(&sk->sk_write_queue, skb) { const int off = skb_transport_offset(skb); const int len = skb->len - off; csum = skb_checksum(skb, off, (cscov > len)? len : cscov, csum); if ((cscov -= len) <= 0) break; } return csum; } /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { const struct udp_sock *up = udp_sk(skb->sk); const int off = skb_transport_offset(skb); int len = skb->len - off; if ((up->pcflag & UDPLITE_SEND_CC) && up->pcslen < len) { if (0 < up->pcslen) len = up->pcslen; udp_hdr(skb)->len = htons(up->pcslen); } skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ return skb_checksum(skb, off, len, 0); } void udplite4_register(void); int udplite_get_port(struct sock *sk, unsigned short snum, int (*scmp)(const struct sock *, const struct sock *)); #endif /* _UDPLITE_H */
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * CLC (connection layer control) handshake over initial TCP socket to * prepare for RDMA traffic * * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/in.h> #include <linux/inetdevice.h> #include <linux/if_ether.h> #include <linux/sched/signal.h> #include <linux/utsname.h> #include <linux/ctype.h> #include <net/addrconf.h> #include <net/sock.h> #include <net/tcp.h> #include "smc.h" #include "smc_core.h" #include "smc_clc.h" #include "smc_ib.h" #include "smc_ism.h" #define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 #define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 #define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78 #define SMC_CLC_RECV_BUF_LEN 100 /* eye catcher "SMCR" EBCDIC for CLC messages */ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; /* eye catcher "SMCD" EBCDIC for CLC messages */ static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN]; /* check arriving CLC proposal */ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; struct smc_clc_smcd_v2_extension *smcd_v2_ext; struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx || pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) return false; if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; if (ntohs(hdr->length) != sizeof(*pclc) + ntohs(pclc->iparea_offset) + sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(struct smc_clc_ipv6_prefix) + sizeof(struct smc_clc_msg_trail)) return false; } else { if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + (hdr->typev1 != SMC_TYPE_N ? sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(struct smc_clc_ipv6_prefix) : 0) + (hdr->typev2 != SMC_TYPE_N ? sizeof(*v2_ext) + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) + (smcd_indicated(hdr->typev2) ? sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt * sizeof(struct smc_clc_smcd_gid_chid) : 0) + sizeof(struct smc_clc_msg_trail)) return false; } return true; } /* check arriving CLC accept or confirm */ static bool smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) { struct smc_clc_msg_hdr *hdr = &clc_v2->hdr; if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) return false; if (hdr->version == SMC_V1) { if ((hdr->typev1 == SMC_TYPE_R && ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || (hdr->typev1 == SMC_TYPE_D && ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) return false; } else { if (hdr->typev1 == SMC_TYPE_D && ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 && (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 + sizeof(struct smc_clc_first_contact_ext))) return false; } return true; } static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len) { memset(fce, 0, sizeof(*fce)); fce->os_type = SMC_CLC_OS_LINUX; fce->release = SMC_RELEASE; memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname)); (*len) += sizeof(*fce); } /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers */ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) { struct smc_clc_msg_accept_confirm_v2 *clc_v2; struct smc_clc_msg_proposal *pclc; struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; switch (clcm->type) { case SMC_CLC_PROPOSAL: pclc = (struct smc_clc_msg_proposal *)clcm; if (!smc_clc_msg_prop_valid(pclc)) return false; trl = (struct smc_clc_msg_trail *) ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm; if (!smc_clc_msg_acc_conf_valid(clc_v2)) return false; trl = (struct smc_clc_msg_trail *) ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) - sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; if (ntohs(dclc->hdr.length) != sizeof(*dclc)) return false; trl = &dclc->trl; break; default: return false; } if (check_trl && memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; return true; } /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dst->dev); const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (!inet_ifa_match(ipv4, ifa)) continue; prop->prefix_len = inet_mask_len(ifa->ifa_mask); prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ return 0; } return -ENOENT; } /* fill CLC proposal msg with ipv6 prefixes from device */ static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); struct inet6_ifaddr *ifa; int cnt = 0; if (!in6_dev) return -ENODEV; /* use a maximum of 8 IPv6 prefixes from device */ list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) continue; ipv6_addr_prefix(&ipv6_prfx[cnt].prefix, &ifa->addr, ifa->prefix_len); ipv6_prfx[cnt].prefix_len = ifa->prefix_len; cnt++; if (cnt == SMC_CLC_MAX_V6_PREFIX) break; } prop->ipv6_prefixes_cnt = cnt; if (cnt) return 0; #endif return -ENOENT; } /* retrieve and set prefixes in CLC proposal msg */ static int smc_clc_prfx_set(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct sockaddr_storage addrs; struct sockaddr_in6 *addr6; struct sockaddr_in *addr; int rc = -ENOENT; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } /* get address to which the internal TCP socket is bound */ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) goto out_rel; /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); if (addrs.ss_family == PF_INET) { /* IPv4 */ addr = (struct sockaddr_in *)&addrs; rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { /* mapped IPv4 address - peer is IPv4 only */ rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], prop); } else { /* IPv6 */ rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); } rcu_read_unlock(); out_rel: dst_release(dst); out: return rc; } /* match ipv4 addrs of dev against addr in CLC proposal */ static int smc_clc_prfx_match4_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && inet_ifa_match(prop->outgoing_subnet, ifa)) return 0; } return -ENOENT; } /* match ipv6 addrs of dev against addrs in CLC proposal */ static int smc_clc_prfx_match6_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *in6_dev = __in6_dev_get(dev); struct smc_clc_ipv6_prefix *ipv6_prfx; struct inet6_ifaddr *ifa; int i, max; if (!in6_dev) return -ENODEV; /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */ ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop)); max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX); list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) continue; for (i = 0; i < max; i++) { if (ifa->prefix_len == ipv6_prfx[i].prefix_len && ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix, ifa->prefix_len)) return 0; } } #endif return -ENOENT; } /* check if proposed prefixes match one of our device prefixes */ int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop) { struct dst_entry *dst = sk_dst_get(clcsock->sk); int rc; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } rcu_read_lock(); if (!prop->ipv6_prefixes_cnt) rc = smc_clc_prfx_match4_rcu(dst->dev, prop); else rc = smc_clc_prfx_match6_rcu(dst->dev, prop); rcu_read_unlock(); out_rel: dst_release(dst); out: return rc; } /* Wait for data on the tcp-socket, analyze received data * Returns: * 0 if success and it was not a decline that we received. * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send. * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. */ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout) { long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; struct sock *clc_sk = smc->clcsock->sk; struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; int reason_code = 0; struct kvec vec = {buf, buflen}; int len, datlen, recvlen; bool check_trl = true; int krflags; /* peek the first few bytes to determine length of data to receive * so we don't consume any subsequent CLC message or payload data * in the TCP byte stream */ /* * Caller must make sure that buflen is no less than * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; clc_sk->sk_rcvtimeo = timeout; iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { reason_code = -EINTR; clc_sk->sk_err = EINTR; smc->sk.sk_err = EINTR; goto out; } if (clc_sk->sk_err) { reason_code = -clc_sk->sk_err; if (clc_sk->sk_err == EAGAIN && expected_type == SMC_CLC_DECLINE) clc_sk->sk_err = 0; /* reset for fallback usage */ else smc->sk.sk_err = clc_sk->sk_err; goto out; } if (!len) { /* peer has performed orderly shutdown */ smc->sk.sk_err = ECONNRESET; reason_code = -ECONNRESET; goto out; } if (len < 0) { if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE) smc->sk.sk_err = -len; reason_code = len; goto out; } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || (clcm->version < SMC_V1) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); if (datlen > buflen) { check_trl = false; recvlen = buflen; } else { recvlen = datlen; } iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } datlen -= len; while (datlen) { u8 tmp[SMC_CLC_RECV_BUF_LEN]; vec.iov_base = &tmp; vec.iov_len = SMC_CLC_RECV_BUF_LEN; /* receive remaining proposal message */ recvlen = datlen > SMC_CLC_RECV_BUF_LEN ? SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < recvlen) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { struct smc_clc_msg_decline *dclc; dclc = (struct smc_clc_msg_decline *)clcm; reason_code = SMC_CLC_DECL_PEERDECL; smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 & SMC_FIRST_CONTACT_MASK) { smc->conn.lgr->sync_err = 1; smc_lgr_terminate_sched(smc->conn.lgr); } } out: clc_sk->sk_rcvtimeo = rcvtimeo; return reason_code; } /* send CLC DECLINE message across internal TCP socket */ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) { struct smc_clc_msg_decline dclc; struct msghdr msg; struct kvec vec; int len; memset(&dclc, 0, sizeof(dclc)); memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = version; dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? SMC_FIRST_CONTACT_MASK : 0; if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) && smc_ib_is_valid_local_systemid()) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = sizeof(struct smc_clc_msg_decline); len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(struct smc_clc_msg_decline)); if (len < 0 || len < sizeof(struct smc_clc_msg_decline)) len = -EPROTO; return len > 0 ? 0 : len; } /* send CLC PROPOSAL message across internal TCP socket */ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *smcd_v2_ext; struct smc_clc_msg_proposal_prefix *pclc_prfx; struct smc_clc_msg_proposal *pclc_base; struct smc_clc_smcd_gid_chid *gidchids; struct smc_clc_msg_proposal_area *pclc; struct smc_clc_ipv6_prefix *ipv6_prfx; struct smc_clc_v2_extension *v2_ext; struct smc_clc_msg_smcd *pclc_smcd; struct smc_clc_msg_trail *trl; int len, i, plen, rc; int reason_code = 0; struct kvec vec[8]; struct msghdr msg; pclc = kzalloc(sizeof(*pclc), GFP_KERNEL); if (!pclc) return -ENOMEM; pclc_base = &pclc->pclc_base; pclc_smcd = &pclc->pclc_smcd; pclc_prfx = &pclc->pclc_prfx; ipv6_prfx = pclc->pclc_prfx_ipv6; v2_ext = &pclc->pclc_v2_ext; smcd_v2_ext = &pclc->pclc_smcd_v2_ext; gidchids = pclc->pclc_gidchids; trl = &pclc->pclc_trl; pclc_base->hdr.version = SMC_V2; pclc_base->hdr.typev1 = ini->smc_type_v1; pclc_base->hdr.typev2 = ini->smc_type_v2; plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl); /* retrieve ip prefixes for CLC proposal msg */ if (ini->smc_type_v1 != SMC_TYPE_N) { rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx); if (rc) { if (ini->smc_type_v2 == SMC_TYPE_N) { kfree(pclc); return SMC_CLC_DECL_CNFERR; } pclc_base->hdr.typev1 = SMC_TYPE_N; } else { pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); plen += sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); } } /* build SMC Proposal CLC message */ memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc_base->hdr.type = SMC_CLC_PROPOSAL; if (smcr_indicated(ini->smc_type_v1)) { /* add SMC-R specifics */ memcpy(pclc_base->lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE); memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], ETH_ALEN); } if (smcd_indicated(ini->smc_type_v1)) { /* add SMC-D specifics */ if (ini->ism_dev[0]) { pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); } } if (ini->smc_type_v2 == SMC_TYPE_N) { pclc_smcd->v2_ext_offset = 0; } else { u16 v2_ext_offset; u8 *eid = NULL; v2_ext_offset = sizeof(*pclc_smcd) - offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); if (ini->smc_type_v1 != SMC_TYPE_N) v2_ext_offset += sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); pclc_smcd->v2_ext_offset = htons(v2_ext_offset); v2_ext->hdr.eid_cnt = 0; v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt; v2_ext->hdr.flag.release = SMC_RELEASE; v2_ext->hdr.flag.seid = 1; v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); if (ini->ism_dev[0]) smc_ism_get_system_eid(ini->ism_dev[0], &eid); else smc_ism_get_system_eid(ini->ism_dev[1], &eid); if (eid) memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN); plen += sizeof(*v2_ext) + sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { gidchids[i - 1].gid = htonll(ini->ism_dev[i]->local_gid); gidchids[i - 1].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); } plen += ini->ism_offered_cnt * sizeof(struct smc_clc_smcd_gid_chid); } } pclc_base->hdr.length = htons(plen); memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); /* send SMC Proposal CLC message */ memset(&msg, 0, sizeof(msg)); i = 0; vec[i].iov_base = pclc_base; vec[i++].iov_len = sizeof(*pclc_base); vec[i].iov_base = pclc_smcd; vec[i++].iov_len = sizeof(*pclc_smcd); if (ini->smc_type_v1 != SMC_TYPE_N) { vec[i].iov_base = pclc_prfx; vec[i++].iov_len = sizeof(*pclc_prfx); if (pclc_prfx->ipv6_prefixes_cnt > 0) { vec[i].iov_base = ipv6_prfx; vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); } } if (ini->smc_type_v2 != SMC_TYPE_N) { vec[i].iov_base = v2_ext; vec[i++].iov_len = sizeof(*v2_ext); vec[i].iov_base = smcd_v2_ext; vec[i++].iov_len = sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { vec[i].iov_base = gidchids; vec[i++].iov_len = ini->ism_offered_cnt * sizeof(struct smc_clc_smcd_gid_chid); } } vec[i].iov_base = trl; vec[i++].iov_len = sizeof(*trl); /* due to the few bytes needed for clc-handshake this cannot block */ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); if (len < 0) { smc->sk.sk_err = smc->clcsock->sk->sk_err; reason_code = -smc->sk.sk_err; } else if (len < ntohs(pclc_base->hdr.length)) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; } kfree(pclc); return reason_code; } /* build and send CLC CONFIRM / ACCEPT message */ static int smc_clc_send_confirm_accept(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *clc_v2, int first_contact, u8 version) { struct smc_connection *conn = &smc->conn; struct smc_clc_msg_accept_confirm *clc; struct smc_clc_first_contact_ext fce; struct smc_clc_msg_trail trl; struct kvec vec[3]; struct msghdr msg; int i, len; /* send SMC Confirm CLC msg */ clc = (struct smc_clc_msg_accept_confirm *)clc_v2; clc->hdr.version = version; /* SMC version */ if (first_contact) clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK; if (conn->lgr->is_smcd) { /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = conn->lgr->smcd->local_gid; clc->d0.token = conn->rmb_desc->token; clc->d0.dmbe_size = conn->rmbe_size_short; clc->d0.dmbe_idx = 0; memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); if (version == SMC_V1) { clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); } else { u8 *eid = NULL; clc_v2->chid = htons(smc_ism_get_chid(conn->lgr->smcd)); smc_ism_get_system_eid(conn->lgr->smcd, &eid); if (eid) memcpy(clc_v2->eid, eid, SMC_MAX_EID_LEN); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) smc_clc_fill_fce(&fce, &len); clc_v2->hdr.length = htons(len); } memcpy(trl.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); } else { struct smc_link *link = conn->lnk; /* SMC-R specific settings */ link = conn->lnk; memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_R; clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); memcpy(clc->r0.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(clc->r0.qpn, link->roce_qp->qp_num); clc->r0.rmb_rkey = htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); break; } clc->r0.rmbe_size = conn->rmbe_size_short; clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } memset(&msg, 0, sizeof(msg)); i = 0; vec[i].iov_base = clc_v2; if (version > SMC_V1) vec[i++].iov_len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 - sizeof(trl); else vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? SMCD_CLC_ACCEPT_CONFIRM_LEN : SMCR_CLC_ACCEPT_CONFIRM_LEN) - sizeof(trl); if (version > SMC_V1 && first_contact) { vec[i].iov_base = &fce; vec[i++].iov_len = sizeof(fce); } vec[i].iov_base = &trl; vec[i++].iov_len = sizeof(trl); return kernel_sendmsg(smc->clcsock, &msg, vec, 1, ntohs(clc->hdr.length)); } /* send CLC CONFIRM message across internal TCP socket */ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version) { struct smc_clc_msg_accept_confirm_v2 cclc_v2; int reason_code = 0; int len; /* send SMC Confirm CLC msg */ memset(&cclc_v2, 0, sizeof(cclc_v2)); cclc_v2.hdr.type = SMC_CLC_CONFIRM; len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact, version); if (len < ntohs(cclc_v2.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; } else { smc->sk.sk_err = smc->clcsock->sk->sk_err; reason_code = -smc->sk.sk_err; } } return reason_code; } /* send CLC ACCEPT message across internal TCP socket */ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, u8 version) { struct smc_clc_msg_accept_confirm_v2 aclc_v2; int len; memset(&aclc_v2, 0, sizeof(aclc_v2)); aclc_v2.hdr.type = SMC_CLC_ACCEPT; len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, version); if (len < ntohs(aclc_v2.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; return len > 0 ? 0 : len; } void smc_clc_get_hostname(u8 **host) { *host = &smc_hostname[0]; } void __init smc_clc_init(void) { struct new_utsname *u; memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */ u = utsname(); memcpy(smc_hostname, u->nodename, min_t(size_t, strlen(u->nodename), sizeof(smc_hostname))); }
365 366 361 365 366 10 10 10 20 20 12 4 9 13 13 13 10 6 6 4 142 83 6 17 81 3 11 12 1 23 8 4 8 3 4 9 4 16 2 12 5 3 3 16 3 3 2 8 3 2 13 5 2 3 5 3 10 3 2 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/route.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> #include <net/dsfield.h> #include <net/sock_reuseport.h> #include <linux/errqueue.h> #include <linux/uaccess.h> static bool ipv6_mapped_addr_any(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!fl6->flowi6_oif) fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) fl6->flowi6_oif = np->mcast_oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; struct in6_addr *final_p, final; struct ipv6_txoptions *opt; struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; int err = 0; if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) { flowlabel = fl6_sock_lookup(sk, np->flow_label); if (IS_ERR(flowlabel)) return -EINVAL; } ip6_datagram_flow_key_init(&fl6, sk); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (fix_sk_saddr) { if (ipv6_addr_any(&np->saddr)) np->saddr = fl6.saddr; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { sk->sk_v6_rcv_saddr = fl6.saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } } ip6_sk_dst_store_flow(sk, dst, &fl6); out: fl6_sock_release(flowlabel); return err; } void ip6_datagram_release_cb(struct sock *sk) { struct dst_entry *dst; if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; } rcu_read_unlock(); ip6_datagram_dst_update(sk, false); } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *daddr, old_daddr; __be32 fl6_flowlabel = 0; __be32 old_fl6_flowlabel; __be16 old_dport; int addr_type; int err; if (usin->sin6_family == AF_INET) { if (__ipv6_only_sock(sk)) return -EAFNOSUPPORT; err = __ip4_datagram_connect(sk, uaddr, addr_len); goto ipv4_connected; } if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; if (np->sndflow) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); daddr = &usin->sin6_addr; if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (__ipv6_only_sock(sk)) { err = -ENETUNREACH; goto out; } sin.sin_family = AF_INET; sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, (struct sockaddr *) &sin, sizeof(sin)); ipv4_connected: if (err) goto out; ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr); if (ipv6_addr_any(&np->saddr) || ipv6_mapped_addr_any(&np->saddr)) ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) { ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } goto out; } if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } sk->sk_bound_dev_if = usin->sin6_scope_id; } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) sk->sk_bound_dev_if = np->mcast_oif; /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; goto out; } } /* save the current peer information before updating it */ old_daddr = sk->sk_v6_daddr; old_fl6_flowlabel = np->flow_label; old_dport = inet->inet_dport; sk->sk_v6_daddr = *daddr; np->flow_label = fl6_flowlabel; inet->inet_dport = usin->sin6_port; /* * Check for a route to destination an obtain the * destination cache for it. */ err = ip6_datagram_dst_update(sk, true); if (err) { /* Restore the socket peer info, to keep it consistent with * the old socket state */ sk->sk_v6_daddr = old_daddr; np->flow_label = old_fl6_flowlabel; inet->inet_dport = old_dport; goto out; } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: return err; } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip6_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL_GPL(ip6_datagram_connect); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); if (sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; return ip6_datagram_connect(sk, uaddr, addr_len); } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_TIME_EXCEED: case ICMPV6_DEST_UNREACH: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), icmp6_hdr(skb)->icmp6_datagram_len * 8); } } void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct ipv6_pinfo *np = inet6_sk(sk); struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; if (!np->recverr) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; serr->ee.ee_type = icmph->icmp6_type; serr->ee.ee_code = icmph->icmp6_code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - skb_network_header(skb); serr->port = port; __skb_pull(skb, payload - skb->data); if (inet6_sk(sk)->recverr_rfc4884) ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { const struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; if (!np->recverr) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; ip6_flow_hdr(iph, 0, 0); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = fl6->fl6_dport; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *iph; struct sk_buff *skb; struct ip6_mtuinfo *mtu_info; if (!np->rxopt.bits.rxpmtu) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; mtu_info = IP6CBMTU(skb); mtu_info->ip6m_mtu = mtu; mtu_info->ip6m_addr.sin6_family = AF_INET6; mtu_info->ip6m_addr.sin6_port = 0; mtu_info->ip6m_addr.sin6_flowinfo = 0; mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); skb = xchg(&np->rxpmtu, skb); kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 * errors. This is no longer true, but the test remained, so the v6 stack, * unlike v4, also honors cmsg requests on all wifi and timestamp errors. */ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, struct sock_exterr_skb *serr) { if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) return true; if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) return false; if (!IP6CB(skb)->iif) return false; return true; } /* * Handle MSG_ERRQUEUE */ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; if (skb->protocol == htons(ETH_P_IPV6)) { const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; if (np->sndflow) sin->sin6_flowinfo = ip6_flowinfo(ip6h); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), &sin->sin6_addr); sin->sin6_scope_id = 0; } *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) ip6_datagram_recv_specific_ctl(sk, msg, skb); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); if (inet_sk(sk)->cmsg_flags) ip_cmsg_recv(msg, skb); } } put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } EXPORT_SYMBOL_GPL(ipv6_recv_error); /* * Handle IPV6_RECVPATHMTU */ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ip6_mtuinfo mtu_info; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); int err; int copied; err = -EAGAIN; skb = xchg(&np->rxpmtu, NULL); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); if (sin) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; *addr_len = sizeof(*sin); } put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); err = copied; out_free_skb: kfree_skb(skb); out: return err; } void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; if (is_ipv6) { src_info.ipi6_ifindex = IP6CB(skb)->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; } else { src_info.ipi6_ifindex = PKTINFO_SKB_CB(skb)->ipi_ifindex; ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } if (src_info.ipi6_ifindex >= 0) put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } } void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet6_skb_parm *opt = IP6CB(skb); unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); if (flowinfo) put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } if (opt->lastopt && (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) { /* * Silly enough, but we need to reparse in order to * report extension headers (except for HbH) * in order. * * Also note that IPV6_RECVRTHDRDSTOPTS is NOT * (and WILL NOT be) defined because * IPV6_RECVDSTOPTS is more generic. --yoshfuji */ unsigned int off = sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { unsigned int len; u8 *ptr = nh + off; switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.dstopts) put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); break; case IPPROTO_ROUTING: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.srcrt) put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); break; case IPPROTO_AH: nexthdr = ptr[0]; len = (ptr[1] + 2) << 2; break; default: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; break; } off += len; } } /* socket options in old style */ if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { u8 *ptr = nh + opt->dst0; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; __be16 _ports[2], *ports; ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (ports) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; sin6.sin6_flowinfo = 0; sin6.sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, opt->iif); put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } } if (np->rxopt.bits.recvfragsize && opt->frag_max_size) { int val = opt->frag_max_size; put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val); } } void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { ip6_datagram_recv_common_ctl(sk, msg, skb); ip6_datagram_recv_specific_ctl(sk, msg, skb); } EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; struct ipv6_txoptions *opt = ipc6->opt; int len; int err = 0; for_each_cmsghdr(cmsg, msg) { int addr_type; if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; goto exit_f; } if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IPV6) continue; switch (cmsg->cmsg_type) { case IPV6_PKTINFO: case IPV6_2292PKTINFO: { struct net_device *dev = NULL; int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); src_idx = src_info->ipi6_ifindex; if (src_idx) { if (fl6->flowi6_oif && src_idx != fl6->flowi6_oif && (sk->sk_bound_dev_if != fl6->flowi6_oif || !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); rcu_read_lock(); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (!dev) { rcu_read_unlock(); return -ENODEV; } } else if (addr_type & IPV6_ADDR_LINKLOCAL) { rcu_read_unlock(); return -EINVAL; } if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && !ipv6_chk_acast_addr_src(net, dev, &src_info->ipi6_addr)) err = -EINVAL; else fl6->saddr = src_info->ipi6_addr; } rcu_read_unlock(); if (err) goto exit_f; break; } case IPV6_FLOWINFO: if (cmsg->cmsg_len < CMSG_LEN(4)) { err = -EINVAL; goto exit_f; } if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { err = -EINVAL; goto exit_f; } } fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } opt->opt_nflen += len; opt->hopopt = hdr; break; case IPV6_2292DSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (opt->dst1opt) { err = -EINVAL; goto exit_f; } opt->opt_flen += len; opt->dst1opt = hdr; break; case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (cmsg->cmsg_type == IPV6_DSTOPTS) { opt->opt_flen += len; opt->dst1opt = hdr; } else { opt->opt_nflen += len; opt->dst0opt = hdr; } break; case IPV6_2292RTHDR: case IPV6_RTHDR: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { err = -EINVAL; goto exit_f; } rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) { err = -EINVAL; goto exit_f; } break; #endif default: err = -EINVAL; goto exit_f; } len = ((rthdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } /* segments left must also match */ if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { err = -EINVAL; goto exit_f; } opt->opt_nflen += len; opt->srcrt = rthdr; if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); opt->opt_nflen += dsthdrlen; opt->dst0opt = opt->dst1opt; opt->dst1opt = NULL; opt->opt_flen -= dsthdrlen; } break; case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { err = -EINVAL; goto exit_f; } ipc6->hlimit = *(int *)CMSG_DATA(cmsg); if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) { err = -EINVAL; goto exit_f; } break; case IPV6_TCLASS: { int tc; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; tc = *(int *)CMSG_DATA(cmsg); if (tc < -1 || tc > 0xff) goto exit_f; err = 0; ipc6->tclass = tc; break; } case IPV6_DONTFRAG: { int df; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; df = *(int *)CMSG_DATA(cmsg); if (df < 0 || df > 1) goto exit_f; err = 0; ipc6->dontfrag = df; break; } default: net_dbg_ratelimited("invalid cmsg type: %d\n", cmsg->cmsg_type); err = -EINVAL; goto exit_f; } } exit_f: return err; } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, __u16 destp, int rqueue, int bucket) { const struct in6_addr *dest, *src; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->sk_state, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); }
39 38 38 39 39 39 39 39 39 39 21 36 39 39 39 34 34 34 34 34 34 34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH * Copyright 2018-2020 Intel Corporation */ #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/export.h> #include <net/mac80211.h> #include <crypto/algapi.h> #include <asm/unaligned.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "debugfs_key.h" #include "aes_ccm.h" #include "aes_cmac.h" #include "aes_gmac.h" #include "aes_gcm.h" /** * DOC: Key handling basics * * Key handling in mac80211 is done based on per-interface (sub_if_data) * keys and per-station keys. Since each station belongs to an interface, * each station key also belongs to that interface. * * Hardware acceleration is done on a best-effort basis for algorithms * that are implemented in software, for each key the hardware is asked * to enable that key for offloading but if it cannot do that the key is * simply kept for software encryption (unless it is for an algorithm * that isn't implemented in software). * There is currently no way of knowing whether a key is handled in SW * or HW except by looking into debugfs. * * All key management is internally protected by a mutex. Within all * other parts of mac80211, key references are, just as STA structure * references, protected by RCU. Note, however, that some things are * unprotected, namely the key->sta dereferences within the hardware * acceleration functions. This means that sta_info_destroy() must * remove the key which waits for an RCU grace period. */ static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; static void assert_key_lock(struct ieee80211_local *local) { lockdep_assert_held(&local->key_mtx); } static void update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) { struct ieee80211_sub_if_data *vlan; if (sdata->vif.type != NL80211_IFTYPE_AP) return; /* crypto_tx_tailroom_needed_cnt is protected by this */ assert_key_lock(sdata->local); rcu_read_lock(); list_for_each_entry_rcu(vlan, &sdata->u.ap.vlans, u.vlan.list) vlan->crypto_tx_tailroom_needed_cnt += delta; rcu_read_unlock(); } static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) { /* * When this count is zero, SKB resizing for allocating tailroom * for IV or MMIC is skipped. But, this check has created two race * cases in xmit path while transiting from zero count to one: * * 1. SKB resize was skipped because no key was added but just before * the xmit key is added and SW encryption kicks off. * * 2. SKB resize was skipped because all the keys were hw planted but * just before xmit one of the key is deleted and SW encryption kicks * off. * * In both the above case SW encryption will find not enough space for * tailroom and exits with WARN_ON. (See WARN_ONs at wpa.c) * * Solution has been explained at * http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net */ assert_key_lock(sdata->local); update_vlan_tailroom_need_count(sdata, 1); if (!sdata->crypto_tx_tailroom_needed_cnt++) { /* * Flush all XMIT packets currently using HW encryption or no * encryption at all if the count transition is from 0 -> 1. */ synchronize_net(); } } static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) { assert_key_lock(sdata->local); WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta); update_vlan_tailroom_need_count(sdata, -delta); sdata->crypto_tx_tailroom_needed_cnt -= delta; } static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata = key->sdata; struct sta_info *sta; int ret = -EOPNOTSUPP; might_sleep(); if (key->flags & KEY_FLAG_TAINTED) { /* If we get here, it's during resume and the key is * tainted so shouldn't be used/programmed any more. * However, its flags may still indicate that it was * programmed into the device (since we're in resume) * so clear that flag now to avoid trying to remove * it again later. */ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(sdata); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; return -EINVAL; } if (!key->local->ops->set_key) goto out_unsupported; assert_key_lock(key->local); sta = key->sta; /* * If this is a per-STA GTK, check if it * is supported; if not, return. */ if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) && !ieee80211_hw_check(&key->local->hw, SUPPORTS_PER_STA_GTK)) goto out_unsupported; if (sta && !sta->uploaded) goto out_unsupported; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { /* * The driver doesn't know anything about VLAN interfaces. * Hence, don't send GTKs for VLAN interfaces to the driver. */ if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) { ret = 1; goto out_unsupported; } } ret = drv_set_key(key->local, SET_KEY, sdata, sta ? &sta->sta : NULL, &key->conf); if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) decrease_tailroom_need_count(sdata, 1); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)); return 0; } if (ret != -ENOSPC && ret != -EOPNOTSUPP && ret != 1) sdata_err(sdata, "failed to set key (%d, %pM) to hardware (%d)\n", key->conf.keyidx, sta ? sta->sta.addr : bcast_addr, ret); out_unsupported: switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* all of these we can do in software - if driver can */ if (ret == 1) return 0; if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) return -EINVAL; return 0; default: return -EINVAL; } } static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata; struct sta_info *sta; int ret; might_sleep(); if (!key || !key->local->ops->set_key) return; assert_key_lock(key->local); if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) return; sta = key->sta; sdata = key->sdata; if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(sdata); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; ret = drv_set_key(key->local, DISABLE_KEY, sdata, sta ? &sta->sta : NULL, &key->conf); if (ret) sdata_err(sdata, "failed to remove key (%d, %pM) from hardware (%d)\n", key->conf.keyidx, sta ? sta->sta.addr : bcast_addr, ret); } static int _ieee80211_set_tx_key(struct ieee80211_key *key, bool force) { struct sta_info *sta = key->sta; struct ieee80211_local *local = key->local; assert_key_lock(local); set_sta_flag(sta, WLAN_STA_USES_ENCRYPTION); sta->ptk_idx = key->conf.keyidx; if (force || !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; } int ieee80211_set_tx_key(struct ieee80211_key *key) { return _ieee80211_set_tx_key(key, false); } static void ieee80211_pairwise_rekey(struct ieee80211_key *old, struct ieee80211_key *new) { struct ieee80211_local *local = new->local; struct sta_info *sta = new->sta; int i; assert_key_lock(local); if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) { /* Extended Key ID key install, initial one or rekey */ if (sta->ptk_idx != INVALID_PTK_KEYIDX && !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) { /* Aggregation Sessions with Extended Key ID must not * mix MPDUs with different keyIDs within one A-MPDU. * Tear down running Tx aggregation sessions and block * new Rx/Tx aggregation requests during rekey to * ensure there are no A-MPDUs when the driver is not * supporting A-MPDU key borders. (Blocking Tx only * would be sufficient but WLAN_STA_BLOCK_BA gets the * job done for the few ms we need it.) */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); mutex_lock(&sta->ampdu_mlme.mtx); for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_tx_ba_session(sta, i, AGG_STOP_LOCAL_REQUEST); mutex_unlock(&sta->ampdu_mlme.mtx); } } else if (old) { /* Rekey without Extended Key ID. * Aggregation sessions are OK when running on SW crypto. * A broken remote STA may cause issues not observed with HW * crypto, though. */ if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) return; /* Stop Tx till we are on the new key */ old->flags |= KEY_FLAG_TAINTED; ieee80211_clear_fast_xmit(sta); if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_LOCAL_REQUEST); } if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) { pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.", sta->sta.addr); /* Flushing the driver queues *may* help prevent * the clear text leaks and freezes. */ ieee80211_flush_queues(local, old->sdata, false); } } } static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx, bool uni, bool multi) { struct ieee80211_key *key = NULL; assert_key_lock(sdata->local); if (idx >= 0 && idx < NUM_DEFAULT_KEYS) key = key_mtx_dereference(sdata->local, sdata->keys[idx]); if (uni) { rcu_assign_pointer(sdata->default_unicast_key, key); ieee80211_check_fast_xmit_iface(sdata); if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN) drv_set_default_unicast_key(sdata->local, sdata, idx); } if (multi) rcu_assign_pointer(sdata->default_multicast_key, key); ieee80211_debugfs_key_update_default(sdata); } void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx, bool uni, bool multi) { mutex_lock(&sdata->local->key_mtx); __ieee80211_set_default_key(sdata, idx, uni, multi); mutex_unlock(&sdata->local->key_mtx); } static void __ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_key *key = NULL; assert_key_lock(sdata->local); if (idx >= NUM_DEFAULT_KEYS && idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) key = key_mtx_dereference(sdata->local, sdata->keys[idx]); rcu_assign_pointer(sdata->default_mgmt_key, key); ieee80211_debugfs_key_update_default(sdata); } void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx) { mutex_lock(&sdata->local->key_mtx); __ieee80211_set_default_mgmt_key(sdata, idx); mutex_unlock(&sdata->local->key_mtx); } static void __ieee80211_set_default_beacon_key(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_key *key = NULL; assert_key_lock(sdata->local); if (idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS && idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS) key = key_mtx_dereference(sdata->local, sdata->keys[idx]); rcu_assign_pointer(sdata->default_beacon_key, key); ieee80211_debugfs_key_update_default(sdata); } void ieee80211_set_default_beacon_key(struct ieee80211_sub_if_data *sdata, int idx) { mutex_lock(&sdata->local->key_mtx); __ieee80211_set_default_beacon_key(sdata, idx); mutex_unlock(&sdata->local->key_mtx); } static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, bool pairwise, struct ieee80211_key *old, struct ieee80211_key *new) { int idx; int ret = 0; bool defunikey, defmultikey, defmgmtkey, defbeaconkey; /* caller must provide at least one old/new */ if (WARN_ON(!new && !old)) return 0; if (new) list_add_tail_rcu(&new->list, &sdata->key_list); WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); if (new && sta && pairwise) { /* Unicast rekey needs special handling. With Extended Key ID * old is still NULL for the first rekey. */ ieee80211_pairwise_rekey(old, new); } if (old) { idx = old->conf.keyidx; if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { ieee80211_key_disable_hw_accel(old); if (new) ret = ieee80211_key_enable_hw_accel(new); } } else { /* new must be provided in case old is not */ idx = new->conf.keyidx; if (!new->local->wowlan) ret = ieee80211_key_enable_hw_accel(new); } if (ret) return ret; if (sta) { if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); if (new && !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) _ieee80211_set_tx_key(new, true); } else { rcu_assign_pointer(sta->gtk[idx], new); } /* Only needed for transition from no key -> key. * Still triggers unnecessary when using Extended Key ID * and installing the second key ID the first time. */ if (new && !old) ieee80211_check_fast_rx(sta); } else { defunikey = old && old == key_mtx_dereference(sdata->local, sdata->default_unicast_key); defmultikey = old && old == key_mtx_dereference(sdata->local, sdata->default_multicast_key); defmgmtkey = old && old == key_mtx_dereference(sdata->local, sdata->default_mgmt_key); defbeaconkey = old && old == key_mtx_dereference(sdata->local, sdata->default_beacon_key); if (defunikey && !new) __ieee80211_set_default_key(sdata, -1, true, false); if (defmultikey && !new) __ieee80211_set_default_key(sdata, -1, false, true); if (defmgmtkey && !new) __ieee80211_set_default_mgmt_key(sdata, -1); if (defbeaconkey && !new) __ieee80211_set_default_beacon_key(sdata, -1); rcu_assign_pointer(sdata->keys[idx], new); if (defunikey && new) __ieee80211_set_default_key(sdata, new->conf.keyidx, true, false); if (defmultikey && new) __ieee80211_set_default_key(sdata, new->conf.keyidx, false, true); if (defmgmtkey && new) __ieee80211_set_default_mgmt_key(sdata, new->conf.keyidx); if (defbeaconkey && new) __ieee80211_set_default_beacon_key(sdata, new->conf.keyidx); } if (old) list_del_rcu(&old->list); return 0; } struct ieee80211_key * ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, const u8 *key_data, size_t seq_len, const u8 *seq, const struct ieee80211_cipher_scheme *cs) { struct ieee80211_key *key; int i, j, err; if (WARN_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS)) return ERR_PTR(-EINVAL); key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); if (!key) return ERR_PTR(-ENOMEM); /* * Default to software encryption; we'll later upload the * key to the hardware if possible. */ key->conf.flags = 0; key->flags = 0; key->conf.cipher = cipher; key->conf.keyidx = idx; key->conf.keylen = key_len; switch (cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: key->conf.iv_len = IEEE80211_WEP_IV_LEN; key->conf.icv_len = IEEE80211_WEP_ICV_LEN; break; case WLAN_CIPHER_SUITE_TKIP: key->conf.iv_len = IEEE80211_TKIP_IV_LEN; key->conf.icv_len = IEEE80211_TKIP_ICV_LEN; if (seq) { for (i = 0; i < IEEE80211_NUM_TIDS; i++) { key->u.tkip.rx[i].iv32 = get_unaligned_le32(&seq[2]); key->u.tkip.rx[i].iv16 = get_unaligned_le16(seq); } } spin_lock_init(&key->u.tkip.txlock); break; case WLAN_CIPHER_SUITE_CCMP: key->conf.iv_len = IEEE80211_CCMP_HDR_LEN; key->conf.icv_len = IEEE80211_CCMP_MIC_LEN; if (seq) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) for (j = 0; j < IEEE80211_CCMP_PN_LEN; j++) key->u.ccmp.rx_pn[i][j] = seq[IEEE80211_CCMP_PN_LEN - j - 1]; } /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt( key_data, key_len, IEEE80211_CCMP_MIC_LEN); if (IS_ERR(key->u.ccmp.tfm)) { err = PTR_ERR(key->u.ccmp.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_CCMP_256: key->conf.iv_len = IEEE80211_CCMP_256_HDR_LEN; key->conf.icv_len = IEEE80211_CCMP_256_MIC_LEN; for (i = 0; seq && i < IEEE80211_NUM_TIDS + 1; i++) for (j = 0; j < IEEE80211_CCMP_256_PN_LEN; j++) key->u.ccmp.rx_pn[i][j] = seq[IEEE80211_CCMP_256_PN_LEN - j - 1]; /* Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt( key_data, key_len, IEEE80211_CCMP_256_MIC_LEN); if (IS_ERR(key->u.ccmp.tfm)) { err = PTR_ERR(key->u.ccmp.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->conf.iv_len = 0; if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) key->conf.icv_len = sizeof(struct ieee80211_mmie); else key->conf.icv_len = sizeof(struct ieee80211_mmie_16); if (seq) for (j = 0; j < IEEE80211_CMAC_PN_LEN; j++) key->u.aes_cmac.rx_pn[j] = seq[IEEE80211_CMAC_PN_LEN - j - 1]; /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.aes_cmac.tfm = ieee80211_aes_cmac_key_setup(key_data, key_len); if (IS_ERR(key->u.aes_cmac.tfm)) { err = PTR_ERR(key->u.aes_cmac.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: key->conf.iv_len = 0; key->conf.icv_len = sizeof(struct ieee80211_mmie_16); if (seq) for (j = 0; j < IEEE80211_GMAC_PN_LEN; j++) key->u.aes_gmac.rx_pn[j] = seq[IEEE80211_GMAC_PN_LEN - j - 1]; /* Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.aes_gmac.tfm = ieee80211_aes_gmac_key_setup(key_data, key_len); if (IS_ERR(key->u.aes_gmac.tfm)) { err = PTR_ERR(key->u.aes_gmac.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: key->conf.iv_len = IEEE80211_GCMP_HDR_LEN; key->conf.icv_len = IEEE80211_GCMP_MIC_LEN; for (i = 0; seq && i < IEEE80211_NUM_TIDS + 1; i++) for (j = 0; j < IEEE80211_GCMP_PN_LEN; j++) key->u.gcmp.rx_pn[i][j] = seq[IEEE80211_GCMP_PN_LEN - j - 1]; /* Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.gcmp.tfm = ieee80211_aes_gcm_key_setup_encrypt(key_data, key_len); if (IS_ERR(key->u.gcmp.tfm)) { err = PTR_ERR(key->u.gcmp.tfm); kfree(key); return ERR_PTR(err); } break; default: if (cs) { if (seq_len && seq_len != cs->pn_len) { kfree(key); return ERR_PTR(-EINVAL); } key->conf.iv_len = cs->hdr_len; key->conf.icv_len = cs->mic_len; for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) for (j = 0; j < seq_len; j++) key->u.gen.rx_pn[i][j] = seq[seq_len - j - 1]; key->flags |= KEY_FLAG_CIPHER_SCHEME; } } memcpy(key->conf.key, key_data, key_len); INIT_LIST_HEAD(&key->list); return key; } static void ieee80211_key_free_common(struct ieee80211_key *key) { switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: ieee80211_aes_key_free(key->u.ccmp.tfm); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: ieee80211_aes_gmac_key_free(key->u.aes_gmac.tfm); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: ieee80211_aes_gcm_key_free(key->u.gcmp.tfm); break; } kfree_sensitive(key); } static void __ieee80211_key_destroy(struct ieee80211_key *key, bool delay_tailroom) { if (key->local) { struct ieee80211_sub_if_data *sdata = key->sdata; ieee80211_debugfs_key_remove(key); if (delay_tailroom) { /* see ieee80211_delayed_tailroom_dec */ sdata->crypto_tx_tailroom_pending_dec++; schedule_delayed_work(&sdata->dec_tailroom_needed_wk, HZ/2); } else { decrease_tailroom_need_count(sdata, 1); } } ieee80211_key_free_common(key); } static void ieee80211_key_destroy(struct ieee80211_key *key, bool delay_tailroom) { if (!key) return; /* * Synchronize so the TX path and rcu key iterators * can no longer be using this key before we free/remove it. */ synchronize_net(); __ieee80211_key_destroy(key, delay_tailroom); } void ieee80211_key_free_unused(struct ieee80211_key *key) { WARN_ON(key->sdata || key->local); ieee80211_key_free_common(key); } static bool ieee80211_key_identical(struct ieee80211_sub_if_data *sdata, struct ieee80211_key *old, struct ieee80211_key *new) { u8 tkip_old[WLAN_KEY_LEN_TKIP], tkip_new[WLAN_KEY_LEN_TKIP]; u8 *tk_old, *tk_new; if (!old || new->conf.keylen != old->conf.keylen) return false; tk_old = old->conf.key; tk_new = new->conf.key; /* * In station mode, don't compare the TX MIC key, as it's never used * and offloaded rekeying may not care to send it to the host. This * is the case in iwlwifi, for example. */ if (sdata->vif.type == NL80211_IFTYPE_STATION && new->conf.cipher == WLAN_CIPHER_SUITE_TKIP && new->conf.keylen == WLAN_KEY_LEN_TKIP && !(new->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) { memcpy(tkip_old, tk_old, WLAN_KEY_LEN_TKIP); memcpy(tkip_new, tk_new, WLAN_KEY_LEN_TKIP); memset(tkip_old + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8); memset(tkip_new + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8); tk_old = tkip_old; tk_new = tkip_new; } return !crypto_memneq(tk_old, tk_new, new->conf.keylen); } int ieee80211_key_link(struct ieee80211_key *key, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { static atomic_t key_color = ATOMIC_INIT(0); struct ieee80211_key *old_key; int idx = key->conf.keyidx; bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; /* * We want to delay tailroom updates only for station - in that * case it helps roaming speed, but in other cases it hurts and * can cause warnings to appear. */ bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; int ret = -EOPNOTSUPP; mutex_lock(&sdata->local->key_mtx); if (sta && pairwise) { struct ieee80211_key *alt_key; old_key = key_mtx_dereference(sdata->local, sta->ptk[idx]); alt_key = key_mtx_dereference(sdata->local, sta->ptk[idx ^ 1]); /* The rekey code assumes that the old and new key are using * the same cipher. Enforce the assumption for pairwise keys. */ if ((alt_key && alt_key->conf.cipher != key->conf.cipher) || (old_key && old_key->conf.cipher != key->conf.cipher)) goto out; } else if (sta) { old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]); } else { old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]); } /* Non-pairwise keys must also not switch the cipher on rekey */ if (!pairwise) { if (old_key && old_key->conf.cipher != key->conf.cipher) goto out; } /* * Silently accept key re-installation without really installing the * new version of the key to avoid nonce reuse or replay issues. */ if (ieee80211_key_identical(sdata, old_key, key)) { ieee80211_key_free_unused(key); ret = -EALREADY; goto out; } key->local = sdata->local; key->sdata = sdata; key->sta = sta; /* * Assign a unique ID to every key so we can easily prevent mixed * key and fragment cache attacks. */ key->color = atomic_inc_return(&key_color); increment_tailroom_need_count(sdata); ret = ieee80211_key_replace(sdata, sta, pairwise, old_key, key); if (!ret) { ieee80211_debugfs_key_add(key); ieee80211_key_destroy(old_key, delay_tailroom); } else { ieee80211_key_free(key, delay_tailroom); } out: mutex_unlock(&sdata->local->key_mtx); return ret; } void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom) { if (!key) return; /* * Replace key with nothingness if it was ever used. */ if (key->sdata) ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); ieee80211_key_destroy(key, delay_tailroom); } void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata) { struct ieee80211_key *key; struct ieee80211_sub_if_data *vlan; lockdep_assert_wiphy(sdata->local->hw.wiphy); mutex_lock(&sdata->local->key_mtx); sdata->crypto_tx_tailroom_needed_cnt = 0; sdata->crypto_tx_tailroom_pending_dec = 0; if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { vlan->crypto_tx_tailroom_needed_cnt = 0; vlan->crypto_tx_tailroom_pending_dec = 0; } } if (ieee80211_sdata_running(sdata)) { list_for_each_entry(key, &sdata->key_list, list) { increment_tailroom_need_count(sdata); ieee80211_key_enable_hw_accel(key); } } mutex_unlock(&sdata->local->key_mtx); } static void ieee80211_key_iter(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_key *key, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data) { /* skip keys of station in removal process */ if (key->sta && key->sta->removed) return; if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) return; iter(hw, vif, key->sta ? &key->sta->sta : NULL, &key->conf, iter_data); } void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_key *key, *tmp; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(hw->wiphy); mutex_lock(&local->key_mtx); if (vif) { sdata = vif_to_sdata(vif); list_for_each_entry_safe(key, tmp, &sdata->key_list, list) ieee80211_key_iter(hw, vif, key, iter, iter_data); } else { list_for_each_entry(sdata, &local->interfaces, list) list_for_each_entry_safe(key, tmp, &sdata->key_list, list) ieee80211_key_iter(hw, &sdata->vif, key, iter, iter_data); } mutex_unlock(&local->key_mtx); } EXPORT_SYMBOL(ieee80211_iter_keys); static void _ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, struct ieee80211_sub_if_data *sdata, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data) { struct ieee80211_key *key; list_for_each_entry_rcu(key, &sdata->key_list, list) ieee80211_key_iter(hw, &sdata->vif, key, iter, iter_data); } void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; if (vif) { sdata = vif_to_sdata(vif); _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data); } else { list_for_each_entry_rcu(sdata, &local->interfaces, list) _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data); } } EXPORT_SYMBOL(ieee80211_iter_keys_rcu); static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata, struct list_head *keys) { struct ieee80211_key *key, *tmp; decrease_tailroom_need_count(sdata, sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; ieee80211_debugfs_key_remove_mgmt_default(sdata); ieee80211_debugfs_key_remove_beacon_default(sdata); list_for_each_entry_safe(key, tmp, &sdata->key_list, list) { ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); list_add_tail(&key->list, keys); } ieee80211_debugfs_key_update_default(sdata); } void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, bool force_synchronize) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *vlan; struct ieee80211_sub_if_data *master; struct ieee80211_key *key, *tmp; LIST_HEAD(keys); cancel_delayed_work_sync(&sdata->dec_tailroom_needed_wk); mutex_lock(&local->key_mtx); ieee80211_free_keys_iface(sdata, &keys); if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) ieee80211_free_keys_iface(vlan, &keys); } if (!list_empty(&keys) || force_synchronize) synchronize_net(); list_for_each_entry_safe(key, tmp, &keys, list) __ieee80211_key_destroy(key, false); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (sdata->bss) { master = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt != master->crypto_tx_tailroom_needed_cnt); } } else { WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || sdata->crypto_tx_tailroom_pending_dec); } if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || vlan->crypto_tx_tailroom_pending_dec); } mutex_unlock(&local->key_mtx); } void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta) { struct ieee80211_key *key; int i; mutex_lock(&local->key_mtx); for (i = 0; i < ARRAY_SIZE(sta->gtk); i++) { key = key_mtx_dereference(local, sta->gtk[i]); if (!key) continue; ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); __ieee80211_key_destroy(key, key->sdata->vif.type == NL80211_IFTYPE_STATION); } for (i = 0; i < NUM_DEFAULT_KEYS; i++) { key = key_mtx_dereference(local, sta->ptk[i]); if (!key) continue; ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); __ieee80211_key_destroy(key, key->sdata->vif.type == NL80211_IFTYPE_STATION); } mutex_unlock(&local->key_mtx); } void ieee80211_delayed_tailroom_dec(struct work_struct *wk) { struct ieee80211_sub_if_data *sdata; sdata = container_of(wk, struct ieee80211_sub_if_data, dec_tailroom_needed_wk.work); /* * The reason for the delayed tailroom needed decrementing is to * make roaming faster: during roaming, all keys are first deleted * and then new keys are installed. The first new key causes the * crypto_tx_tailroom_needed_cnt to go from 0 to 1, which invokes * the cost of synchronize_net() (which can be slow). Avoid this * by deferring the crypto_tx_tailroom_needed_cnt decrementing on * key removal for a while, so if we roam the value is larger than * zero and no 0->1 transition happens. * * The cost is that if the AP switching was from an AP with keys * to one without, we still allocate tailroom while it would no * longer be needed. However, in the typical (fast) roaming case * within an ESS this usually won't happen. */ mutex_lock(&sdata->local->key_mtx); decrease_tailroom_need_count(sdata, sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; mutex_unlock(&sdata->local->key_mtx); } void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, const u8 *replay_ctr, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_gtk_rekey_notify(sdata, bssid, replay_ctr); cfg80211_gtk_rekey_notify(sdata->dev, bssid, replay_ctr, gfp); } EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify); void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { struct ieee80211_key *key; const u8 *pn; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS)) return; seq->tkip.iv32 = key->u.tkip.rx[tid].iv32; seq->tkip.iv16 = key->u.tkip.rx[tid].iv16; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.ccmp.rx_pn[tid]; memcpy(seq->ccmp.pn, pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; memcpy(seq->aes_cmac.pn, pn, IEEE80211_CMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_gmac.rx_pn; memcpy(seq->aes_gmac.pn, pn, IEEE80211_GMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.gcmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.gcmp.rx_pn[tid]; memcpy(seq->gcmp.pn, pn, IEEE80211_GCMP_PN_LEN); break; } } EXPORT_SYMBOL(ieee80211_get_key_rx_seq); void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { struct ieee80211_key *key; u8 *pn; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS)) return; key->u.tkip.rx[tid].iv32 = seq->tkip.iv32; key->u.tkip.rx[tid].iv16 = seq->tkip.iv16; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.ccmp.rx_pn[tid]; memcpy(pn, seq->ccmp.pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; memcpy(pn, seq->aes_cmac.pn, IEEE80211_CMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_gmac.rx_pn; memcpy(pn, seq->aes_gmac.pn, IEEE80211_GMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.gcmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.gcmp.rx_pn[tid]; memcpy(pn, seq->gcmp.pn, IEEE80211_GCMP_PN_LEN); break; default: WARN_ON(1); break; } } EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq); void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) { struct ieee80211_key *key; key = container_of(keyconf, struct ieee80211_key, conf); assert_key_lock(key->local); /* * if key was uploaded, we assume the driver will/has remove(d) * it, so adjust bookkeeping accordingly */ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(key->sdata); } ieee80211_key_free(key, false); } EXPORT_SYMBOL_GPL(ieee80211_remove_key); struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, struct ieee80211_key_conf *keyconf) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; int err; if (WARN_ON(!local->wowlan)) return ERR_PTR(-EINVAL); if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return ERR_PTR(-EINVAL); key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx, keyconf->keylen, keyconf->key, 0, NULL, NULL); if (IS_ERR(key)) return ERR_CAST(key); if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; err = ieee80211_key_link(key, sdata, NULL); if (err) return ERR_PTR(err); return &key->conf; } EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_add); void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf) { struct ieee80211_key *key; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->u.aes_cmac.icverrors++; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: key->u.aes_gmac.icverrors++; break; default: /* ignore the others for now, we don't keep counters now */ break; } } EXPORT_SYMBOL_GPL(ieee80211_key_mic_failure); void ieee80211_key_replay(struct ieee80211_key_conf *keyconf) { struct ieee80211_key *key; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: key->u.ccmp.replays++; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->u.aes_cmac.replays++; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: key->u.aes_gmac.replays++; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: key->u.gcmp.replays++; break; } } EXPORT_SYMBOL_GPL(ieee80211_key_replay);
158 148 150 149 149 230 14 3 73 3 90 42 26 26 6 1 12 179 1 77 180 179 193 192 190 9 1 4 186 3 7 2 179 179 33 19 19 20 21 2 2 2 2 42 42 1 3 2 6 33 21 2 187 162 42 2 170 2 8 8 9 7 8 8 19 18 1 3 12 27 3 24 7 20 1 10 3 2 2 1 9 1 9 10 7 29 2 27 4 4 25 25 15 2 41 41 41 15 8 11 16 38 3 41 41 174 9 5 4 226 225 22 58 1 44 14 53 6 186 1 5 13 3 170 1 144 14 11 17 9 5 2 5 10 10 9 6 158 1 7 155 159 157 157 5 158 158 5 5 2 6 153 152 8 150 150 5 145 12 187 155 1 155 3 160 224 226 195 173 180 3 2 5 172 47 9 1 3 160 138 174 3 3 14 14 13 8 4 4 2 2 4 4 2 2 1 1 2 1 2 2 1 11 1 10 7 10 5 5 3 1 1 1 14 14 11 1 2 2 6 8 10 4 8 6 11 3 7 7 11 3 11 3 12 2 27 8 19 2 1 2 16 4 16 16 15 14 4 2 2 2 2 4 2 2 2 2 18 1 17 2 4 11 6 6 7 5 8 7 20 1 10 9 9 1 2 6 2 6 6 2 6 2 2 2 6 1 1 4 1 3 2 1 7 2 1 4 2 2 1 9 1 3 5 8 8 2 4 4 4 3 2 2 1 2 9 1 8 1 1 6 5 2 1 1 1 1 1 13 1 3 9 12 4 3 1 11 8 5 2 2 2 2 10 2 1 7 2 1 4 2 2 1 8 1 7 1 3 3 4 1 1 8 1 7 1 1 5 3 1 2 10 1 9 2 2 5 4 1 2 6 1 2 5 3 21 1 20 9 10 10 4 2 1 5 1 2 9 1 8 8 11 1 1 3 6 9 4 2 4 5 1 6 1 5 5 7 1 1 5 1 1 3 3 1 5 1 1 3 1 4 1 3 1 6 1 5 3 14 1 1 12 3 1 7 4 4 10 1 10 10 7 7 1 6 6 2 2 2 1 2 9 2 7 7 7 1 1 5 5 11 1 6 4 4 10 1 1 3 5 5 535 536 60 337 333 2 37 4 4 25 3 11 3 27 20 3 9 2 6 7 2 4 1 9 3 9 2 13 2 2 10 8 8 10 7 11 10 3 9 11 6 7 5 4 6 14 11 7 7 9 7 11 10 7 7 5 2 104 104 87 17 103 41 40 39 1 41 2 9 9 9 20 12 21 20 18 7 7 7 2 2 2 22 18 17 7 16 11 12 88 186 186 183 161 162 161 1 18 2 13 13 13 6 2 11 11 6 176 176 1 176 6 6 2 179 176 3 24 27 27 27 27 185 5 30 28 3 16 2 3 1 16 2 2 2 4 2 2 9 9 1 4 13 14 14 2 2 11 11 8 2 2 8 9 16 16 15 15 15 14 2 2 13 15 54 6 6 59 59 59 59 59 51 51 5 1 1 3 3 19 20 20 20 1 20 20 6 60 156 156 129 17 2 8 3 71 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 Intel Corp. * Copyright (c) 2001-2002 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * These functions interface with the sockets layer to implement the * SCTP Extensions for the Sockets API. * * Note that the descriptions from the specification are USER level * functions--this file is the functions which populate the struct proto * for SCTP which is the BOTTOM of the sockets interface. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Narasimha Budihal <narsi@refcode.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <samudrala@us.ibm.com> * Inaky Perez-Gonzalez <inaky.gonzalez@intel.com> * Ardelle Fan <ardelle.fan@intel.com> * Ryan Layer <rmlayer@us.ibm.com> * Anup Pemmaiah <pemmaiah@cc.usu.edu> * Kevin Gao <kevin.gao@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/time.h> #include <linux/sched/signal.h> #include <linux/ip.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/rhashtable.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/busy_poll.h> #include <linux/socket.h> /* for sa_family_t */ #include <linux/export.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); static void sctp_wait_for_close(struct sock *sk, long timeo); static void sctp_destruct_sock(struct sock *sk); static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, union sctp_addr *addr, int len); static int sctp_bindx_add(struct sock *, struct sockaddr *, int); static int sctp_bindx_rem(struct sock *, struct sockaddr *, int); static int sctp_send_asconf_add_ip(struct sock *, struct sockaddr *, int); static int sctp_send_asconf_del_ip(struct sock *, struct sockaddr *, int); static int sctp_send_asconf(struct sctp_association *asoc, struct sctp_chunk *chunk); static int sctp_do_bind(struct sock *, union sctp_addr *, int); static int sctp_autobind(struct sock *sk); static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sctp_association *assoc, enum sctp_socket_type type); static unsigned long sctp_memory_pressure; static atomic_long_t sctp_memory_allocated; struct percpu_counter sctp_sockets_allocated; static void sctp_enter_memory_pressure(struct sock *sk) { WRITE_ONCE(sctp_memory_pressure, 1); } /* Get the sndbuf space available at the time on the association. */ static inline int sctp_wspace(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; return asoc->ep->sndbuf_policy ? sk->sk_sndbuf - asoc->sndbuf_used : sk_stream_wspace(sk); } /* Increment the used sndbuf space count of the corresponding association by * the size of the outgoing data chunk. * Also, set the skb destructor for sndbuf accounting later. * * Since it is always 1-1 between chunk and skb, and also a new skb is always * allocated for chunk bundling in sctp_packet_transmit(), we can use the * destructor in the data chunk skb for the purpose of the sndbuf space * tracking. */ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) { struct sctp_association *asoc = chunk->asoc; struct sock *sk = asoc->base.sk; /* The sndbuf space is tracked per association. */ sctp_association_hold(asoc); if (chunk->shkey) sctp_auth_shkey_hold(chunk->shkey); skb_set_owner_w(chunk->skb, sk); chunk->skb->destructor = sctp_wfree; /* Save the chunk pointer in skb for sctp_wfree to use later. */ skb_shinfo(chunk->skb)->destructor_arg = chunk; refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk); sk_wmem_queued_add(sk, chunk->skb->truesize + sizeof(struct sctp_chunk)); sk_mem_charge(sk, chunk->skb->truesize); } static void sctp_clear_owner_w(struct sctp_chunk *chunk) { skb_orphan(chunk->skb); } #define traverse_and_process() \ do { \ msg = chunk->msg; \ if (msg == prev_msg) \ continue; \ list_for_each_entry(c, &msg->chunks, frag_list) { \ if ((clear && asoc->base.sk == c->skb->sk) || \ (!clear && asoc->base.sk != c->skb->sk)) \ cb(c); \ } \ prev_msg = msg; \ } while (0) static void sctp_for_each_tx_datachunk(struct sctp_association *asoc, bool clear, void (*cb)(struct sctp_chunk *)) { struct sctp_datamsg *msg, *prev_msg = NULL; struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chunk, *c; struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) list_for_each_entry(chunk, &t->transmitted, transmitted_list) traverse_and_process(); list_for_each_entry(chunk, &q->retransmit, transmitted_list) traverse_and_process(); list_for_each_entry(chunk, &q->sacked, transmitted_list) traverse_and_process(); list_for_each_entry(chunk, &q->abandoned, transmitted_list) traverse_and_process(); list_for_each_entry(chunk, &q->out_chunk_list, list) traverse_and_process(); } static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk, void (*cb)(struct sk_buff *, struct sock *)) { struct sk_buff *skb, *tmp; sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp) cb(skb, sk); sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp) cb(skb, sk); sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp) cb(skb, sk); } /* Verify that this is a valid address. */ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, int len) { struct sctp_af *af; /* Verify basic sockaddr. */ af = sctp_sockaddr_af(sctp_sk(sk), addr, len); if (!af) return -EINVAL; /* Is this a valid SCTP address? */ if (!af->addr_valid(addr, sctp_sk(sk), NULL)) return -EINVAL; if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) return -EINVAL; return 0; } /* Look up the association by its id. If this is not a UDP-style * socket, the ID field is always ignored. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) { struct sctp_association *asoc = NULL; /* If this is not a UDP-style socket, assoc id should be ignored. */ if (!sctp_style(sk, UDP)) { /* Return NULL if the socket state is not ESTABLISHED. It * could be a TCP-style listening socket or a socket which * hasn't yet called connect() to establish an association. */ if (!sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING)) return NULL; /* Get the first and the only association from the list. */ if (!list_empty(&sctp_sk(sk)->ep->asocs)) asoc = list_entry(sctp_sk(sk)->ep->asocs.next, struct sctp_association, asocs); return asoc; } /* Otherwise this is a UDP-style socket. */ if (id <= SCTP_ALL_ASSOC) return NULL; spin_lock_bh(&sctp_assocs_id_lock); asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id); if (asoc && (asoc->base.sk != sk || asoc->base.dead)) asoc = NULL; spin_unlock_bh(&sctp_assocs_id_lock); return asoc; } /* Look up the transport from an address and an assoc id. If both address and * id are specified, the associations matching the address and the id should be * the same. */ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, struct sockaddr_storage *addr, sctp_assoc_t id) { struct sctp_association *addr_asoc = NULL, *id_asoc = NULL; struct sctp_af *af = sctp_get_af_specific(addr->ss_family); union sctp_addr *laddr = (union sctp_addr *)addr; struct sctp_transport *transport; if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len)) return NULL; addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, laddr, &transport); if (!addr_asoc) return NULL; id_asoc = sctp_id2assoc(sk, id); if (id_asoc && (id_asoc != addr_asoc)) return NULL; sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk), (union sctp_addr *)addr); return transport; } /* API 3.1.2 bind() - UDP Style Syntax * The syntax of bind() is, * * ret = bind(int sd, struct sockaddr *addr, int addrlen); * * sd - the socket descriptor returned by socket(). * addr - the address structure (struct sockaddr_in or struct * sockaddr_in6 [RFC 2553]), * addr_len - the size of the address structure. */ static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) { int retval = 0; lock_sock(sk); pr_debug("%s: sk:%p, addr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); /* Disallow binding twice. */ if (!sctp_sk(sk)->ep->base.bind_addr.port) retval = sctp_do_bind(sk, (union sctp_addr *)addr, addr_len); else retval = -EINVAL; release_sock(sk); return retval; } static int sctp_get_port_local(struct sock *, union sctp_addr *); /* Verify this is a valid sockaddr. */ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, union sctp_addr *addr, int len) { struct sctp_af *af; /* Check minimum size. */ if (len < sizeof (struct sockaddr)) return NULL; if (!opt->pf->af_supported(addr->sa.sa_family, opt)) return NULL; if (addr->sa.sa_family == AF_INET6) { if (len < SIN6_LEN_RFC2133) return NULL; /* V4 mapped address are really of AF_INET family */ if (ipv6_addr_v4mapped(&addr->v6.sin6_addr) && !opt->pf->af_supported(AF_INET, opt)) return NULL; } /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); if (len < af->sockaddr_len) return NULL; return af; } static void sctp_auto_asconf_init(struct sctp_sock *sp) { struct net *net = sock_net(&sp->inet.sk); if (net->sctp.default_auto_asconf) { spin_lock_bh(&net->sctp.addr_wq_lock); list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist); spin_unlock_bh(&net->sctp.addr_wq_lock); sp->do_auto_asconf = 1; } } /* Bind a local address either to an endpoint or to an association. */ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) { struct net *net = sock_net(sk); struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; struct sctp_bind_addr *bp = &ep->base.bind_addr; struct sctp_af *af; unsigned short snum; int ret = 0; /* Common sockaddr verification. */ af = sctp_sockaddr_af(sp, addr, len); if (!af) { pr_debug("%s: sk:%p, newaddr:%p, len:%d EINVAL\n", __func__, sk, addr, len); return -EINVAL; } snum = ntohs(addr->v4.sin_port); pr_debug("%s: sk:%p, new addr:%pISc, port:%d, new port:%d, len:%d\n", __func__, sk, &addr->sa, bp->port, snum, len); /* PF specific bind() address verification. */ if (!sp->pf->bind_verify(sp, addr)) return -EADDRNOTAVAIL; /* We must either be unbound, or bind to the same port. * It's OK to allow 0 ports if we are already bound. * We'll just inhert an already bound port in this case */ if (bp->port) { if (!snum) snum = bp->port; else if (snum != bp->port) { pr_debug("%s: new port %d doesn't match existing port " "%d\n", __func__, snum, bp->port); return -EINVAL; } } if (snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; /* See if the address matches any of the addresses we may have * already bound before checking against other endpoints. */ if (sctp_bind_addr_match(bp, addr, sp)) return -EINVAL; /* Make sure we are allowed to bind here. * The function sctp_get_port_local() does duplicate address * detection. */ addr->v4.sin_port = htons(snum); if (sctp_get_port_local(sk, addr)) return -EADDRINUSE; /* Refresh ephemeral port. */ if (!bp->port) { bp->port = inet_sk(sk)->inet_num; sctp_auto_asconf_init(sp); } /* Add the address to the bind address list. * Use GFP_ATOMIC since BHs will be disabled. */ ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len, SCTP_ADDR_SRC, GFP_ATOMIC); if (ret) { sctp_put_port(sk); return ret; } /* Copy back into socket for getsockname() use. */ inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num); sp->pf->to_sk_saddr(addr, sk); return ret; } /* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks * * R1) One and only one ASCONF Chunk MAY be in transit and unacknowledged * at any one time. If a sender, after sending an ASCONF chunk, decides * it needs to transfer another ASCONF Chunk, it MUST wait until the * ASCONF-ACK Chunk returns from the previous ASCONF Chunk before sending a * subsequent ASCONF. Note this restriction binds each side, so at any * time two ASCONF may be in-transit on any given association (one sent * from each endpoint). */ static int sctp_send_asconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { int retval = 0; /* If there is an outstanding ASCONF chunk, queue it for later * transmission. */ if (asoc->addip_last_asconf) { list_add_tail(&chunk->list, &asoc->addip_chunk_list); goto out; } /* Hold the chunk until an ASCONF_ACK is received. */ sctp_chunk_hold(chunk); retval = sctp_primitive_ASCONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); else asoc->addip_last_asconf = chunk; out: return retval; } /* Add a list of addresses as bind addresses to local endpoint or * association. * * Basically run through each address specified in the addrs/addrcnt * array/length pair, determine if it is IPv6 or IPv4 and call * sctp_do_bind() on it. * * If any of them fails, then the operation will be reversed and the * ones that were added will be removed. * * Only sctp_setsockopt_bindx() is supposed to call this function. */ static int sctp_bindx_add(struct sock *sk, struct sockaddr *addrs, int addrcnt) { int cnt; int retval = 0; void *addr_buf; struct sockaddr *sa_addr; struct sctp_af *af; pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); addr_buf = addrs; for (cnt = 0; cnt < addrcnt; cnt++) { /* The list may contain either IPv4 or IPv6 address; * determine the address length for walking thru the list. */ sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); if (!af) { retval = -EINVAL; goto err_bindx_add; } retval = sctp_do_bind(sk, (union sctp_addr *)sa_addr, af->sockaddr_len); addr_buf += af->sockaddr_len; err_bindx_add: if (retval < 0) { /* Failed. Cleanup the ones that have been added */ if (cnt > 0) sctp_bindx_rem(sk, addrs, cnt); return retval; } } return retval; } /* Send an ASCONF chunk with Add IP address parameters to all the peers of the * associations that are part of the endpoint indicating that a list of local * addresses are added to the endpoint. * * If any of the addresses is already in the bind address list of the * association, we do not send the chunk for that association. But it will not * affect other associations. * * Only sctp_setsockopt_bindx() is supposed to call this function. */ static int sctp_send_asconf_add_ip(struct sock *sk, struct sockaddr *addrs, int addrcnt) { struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *asoc; struct sctp_bind_addr *bp; struct sctp_chunk *chunk; struct sctp_sockaddr_entry *laddr; union sctp_addr *addr; union sctp_addr saveaddr; void *addr_buf; struct sctp_af *af; struct list_head *p; int i; int retval = 0; sp = sctp_sk(sk); ep = sp->ep; if (!ep->asconf_enable) return retval; pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); list_for_each_entry(asoc, &ep->asocs, asocs) { if (!asoc->peer.asconf_capable) continue; if (asoc->peer.addip_disabled_mask & SCTP_PARAM_ADD_IP) continue; if (!sctp_state(asoc, ESTABLISHED)) continue; /* Check if any address in the packed array of addresses is * in the bind address list of the association. If so, * do not send the asconf chunk to its peer, but continue with * other associations. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); if (!af) { retval = -EINVAL; goto out; } if (sctp_assoc_lookup_laddr(asoc, addr)) break; addr_buf += af->sockaddr_len; } if (i < addrcnt) continue; /* Use the first valid address in bind addr list of * association as Address Parameter of ASCONF CHUNK. */ bp = &asoc->base.bind_addr; p = bp->address_list.next; laddr = list_entry(p, struct sctp_sockaddr_entry, list); chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs, addrcnt, SCTP_PARAM_ADD_IP); if (!chunk) { retval = -ENOMEM; goto out; } /* Add the new addresses to the bind address list with * use_as_src set to 0. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); memcpy(&saveaddr, addr, af->sockaddr_len); retval = sctp_add_bind_addr(bp, &saveaddr, sizeof(saveaddr), SCTP_ADDR_NEW, GFP_ATOMIC); addr_buf += af->sockaddr_len; } if (asoc->src_out_of_asoc_ok) { struct sctp_transport *trans; list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); trans->ssthresh = asoc->peer.i.a_rwnd; trans->rto = asoc->rto_initial; sctp_max_rto(asoc, trans); trans->rtt = trans->srtt = trans->rttvar = 0; /* Clear the source and route cache */ sctp_transport_route(trans, NULL, sctp_sk(asoc->base.sk)); } } retval = sctp_send_asconf(asoc, chunk); } out: return retval; } /* Remove a list of addresses from bind addresses list. Do not remove the * last address. * * Basically run through each address specified in the addrs/addrcnt * array/length pair, determine if it is IPv6 or IPv4 and call * sctp_del_bind() on it. * * If any of them fails, then the operation will be reversed and the * ones that were removed will be added back. * * At least one address has to be left; if only one address is * available, the operation will return -EBUSY. * * Only sctp_setsockopt_bindx() is supposed to call this function. */ static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; int cnt; struct sctp_bind_addr *bp = &ep->base.bind_addr; int retval = 0; void *addr_buf; union sctp_addr *sa_addr; struct sctp_af *af; pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); addr_buf = addrs; for (cnt = 0; cnt < addrcnt; cnt++) { /* If the bind address list is empty or if there is only one * bind address, there is nothing more to be removed (we need * at least one address here). */ if (list_empty(&bp->address_list) || (sctp_list_single_entry(&bp->address_list))) { retval = -EBUSY; goto err_bindx_rem; } sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa.sa_family); if (!af) { retval = -EINVAL; goto err_bindx_rem; } if (!af->addr_valid(sa_addr, sp, NULL)) { retval = -EADDRNOTAVAIL; goto err_bindx_rem; } if (sa_addr->v4.sin_port && sa_addr->v4.sin_port != htons(bp->port)) { retval = -EINVAL; goto err_bindx_rem; } if (!sa_addr->v4.sin_port) sa_addr->v4.sin_port = htons(bp->port); /* FIXME - There is probably a need to check if sk->sk_saddr and * sk->sk_rcv_addr are currently set to one of the addresses to * be removed. This is something which needs to be looked into * when we are fixing the outstanding issues with multi-homing * socket routing and failover schemes. Refer to comments in * sctp_do_bind(). -daisy */ retval = sctp_del_bind_addr(bp, sa_addr); addr_buf += af->sockaddr_len; err_bindx_rem: if (retval < 0) { /* Failed. Add the ones that has been removed back */ if (cnt > 0) sctp_bindx_add(sk, addrs, cnt); return retval; } } return retval; } /* Send an ASCONF chunk with Delete IP address parameters to all the peers of * the associations that are part of the endpoint indicating that a list of * local addresses are removed from the endpoint. * * If any of the addresses is already in the bind address list of the * association, we do not send the chunk for that association. But it will not * affect other associations. * * Only sctp_setsockopt_bindx() is supposed to call this function. */ static int sctp_send_asconf_del_ip(struct sock *sk, struct sockaddr *addrs, int addrcnt) { struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *asoc; struct sctp_transport *transport; struct sctp_bind_addr *bp; struct sctp_chunk *chunk; union sctp_addr *laddr; void *addr_buf; struct sctp_af *af; struct sctp_sockaddr_entry *saddr; int i; int retval = 0; int stored = 0; chunk = NULL; sp = sctp_sk(sk); ep = sp->ep; if (!ep->asconf_enable) return retval; pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); list_for_each_entry(asoc, &ep->asocs, asocs) { if (!asoc->peer.asconf_capable) continue; if (asoc->peer.addip_disabled_mask & SCTP_PARAM_DEL_IP) continue; if (!sctp_state(asoc, ESTABLISHED)) continue; /* Check if any address in the packed array of addresses is * not present in the bind address list of the association. * If so, do not send the asconf chunk to its peer, but * continue with other associations. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { laddr = addr_buf; af = sctp_get_af_specific(laddr->v4.sin_family); if (!af) { retval = -EINVAL; goto out; } if (!sctp_assoc_lookup_laddr(asoc, laddr)) break; addr_buf += af->sockaddr_len; } if (i < addrcnt) continue; /* Find one address in the association's bind address list * that is not in the packed array of addresses. This is to * make sure that we do not delete all the addresses in the * association. */ bp = &asoc->base.bind_addr; laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs, addrcnt, sp); if ((laddr == NULL) && (addrcnt == 1)) { if (asoc->asconf_addr_del_pending) continue; asoc->asconf_addr_del_pending = kzalloc(sizeof(union sctp_addr), GFP_ATOMIC); if (asoc->asconf_addr_del_pending == NULL) { retval = -ENOMEM; goto out; } asoc->asconf_addr_del_pending->sa.sa_family = addrs->sa_family; asoc->asconf_addr_del_pending->v4.sin_port = htons(bp->port); if (addrs->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addrs; asoc->asconf_addr_del_pending->v4.sin_addr.s_addr = sin->sin_addr.s_addr; } else if (addrs->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addrs; asoc->asconf_addr_del_pending->v6.sin6_addr = sin6->sin6_addr; } pr_debug("%s: keep the last address asoc:%p %pISc at %p\n", __func__, asoc, &asoc->asconf_addr_del_pending->sa, asoc->asconf_addr_del_pending); asoc->src_out_of_asoc_ok = 1; stored = 1; goto skip_mkasconf; } if (laddr == NULL) return -EINVAL; /* We do not need RCU protection throughout this loop * because this is done under a socket lock from the * setsockopt call. */ chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt, SCTP_PARAM_DEL_IP); if (!chunk) { retval = -ENOMEM; goto out; } skip_mkasconf: /* Reset use_as_src flag for the addresses in the bind address * list that are to be deleted. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { laddr = addr_buf; af = sctp_get_af_specific(laddr->v4.sin_family); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, laddr)) saddr->state = SCTP_ADDR_DEL; } addr_buf += af->sockaddr_len; } /* Update the route and saddr entries for all the transports * as some of the addresses in the bind address list are * about to be deleted and cannot be used as source addresses. */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_route(transport, NULL, sctp_sk(asoc->base.sk)); } if (stored) /* We don't need to transmit ASCONF */ continue; retval = sctp_send_asconf(asoc, chunk); } out: return retval; } /* set addr events to assocs in the endpoint. ep and addr_wq must be locked */ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) { struct sock *sk = sctp_opt2sk(sp); union sctp_addr *addr; struct sctp_af *af; /* It is safe to write port space in caller. */ addr = &addrw->a; addr->v4.sin_port = htons(sp->ep->base.bind_addr.port); af = sctp_get_af_specific(addr->sa.sa_family); if (!af) return -EINVAL; if (sctp_verify_addr(sk, addr, af->sockaddr_len)) return -EINVAL; if (addrw->state == SCTP_ADDR_NEW) return sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1); else return sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1); } /* Helper for tunneling sctp_bindx() requests through sctp_setsockopt() * * API 8.1 * int sctp_bindx(int sd, struct sockaddr *addrs, int addrcnt, * int flags); * * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses. * If the sd is an IPv6 socket, the addresses passed can either be IPv4 * or IPv6 addresses. * * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see * Section 3.1.2 for this usage. * * addrs is a pointer to an array of one or more socket addresses. Each * address is contained in its appropriate structure (i.e. struct * sockaddr_in or struct sockaddr_in6) the family of the address type * must be used to distinguish the address length (note that this * representation is termed a "packed array" of addresses). The caller * specifies the number of addresses in the array with addrcnt. * * On success, sctp_bindx() returns 0. On failure, sctp_bindx() returns * -1, and sets errno to the appropriate error code. * * For SCTP, the port given in each socket address must be the same, or * sctp_bindx() will fail, setting errno to EINVAL. * * The flags parameter is formed from the bitwise OR of zero or more of * the following currently defined flags: * * SCTP_BINDX_ADD_ADDR * * SCTP_BINDX_REM_ADDR * * SCTP_BINDX_ADD_ADDR directs SCTP to add the given addresses to the * association, and SCTP_BINDX_REM_ADDR directs SCTP to remove the given * addresses from the association. The two flags are mutually exclusive; * if both are given, sctp_bindx() will fail with EINVAL. A caller may * not remove all addresses from an association; sctp_bindx() will * reject such an attempt with EINVAL. * * An application can use sctp_bindx(SCTP_BINDX_ADD_ADDR) to associate * additional addresses with an endpoint after calling bind(). Or use * sctp_bindx(SCTP_BINDX_REM_ADDR) to remove some addresses a listening * socket is associated with so that no new association accepted will be * associated with those addresses. If the endpoint supports dynamic * address a SCTP_BINDX_REM_ADDR or SCTP_BINDX_ADD_ADDR may cause a * endpoint to send the appropriate message to the peer to change the * peers address lists. * * Adding and removing addresses from a connected association is * optional functionality. Implementations that do not support this * functionality should return EOPNOTSUPP. * * Basically do nothing but copying the addresses from user to kernel * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk. * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace. * * On exit there is no need to do sockfd_put(), sys_setsockopt() does * it. * * sk The sk of the socket * addrs The pointer to the addresses * addrssize Size of the addrs buffer * op Operation to perform (add or remove, see the flags of * sctp_bindx) * * Returns 0 if ok, <0 errno code on error. */ static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs, int addrs_size, int op) { int err; int addrcnt = 0; int walk_size = 0; struct sockaddr *sa_addr; void *addr_buf = addrs; struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", __func__, sk, addr_buf, addrs_size, op); if (unlikely(addrs_size <= 0)) return -EINVAL; /* Walk through the addrs buffer and count the number of addresses. */ while (walk_size < addrs_size) { if (walk_size + sizeof(sa_family_t) > addrs_size) return -EINVAL; sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. */ if (!af || (walk_size + af->sockaddr_len) > addrs_size) return -EINVAL; addrcnt++; addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; } /* Do the work. */ switch (op) { case SCTP_BINDX_ADD_ADDR: /* Allow security module to validate bindx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD, addrs, addrs_size); if (err) return err; err = sctp_bindx_add(sk, addrs, addrcnt); if (err) return err; return sctp_send_asconf_add_ip(sk, addrs, addrcnt); case SCTP_BINDX_REM_ADDR: err = sctp_bindx_rem(sk, addrs, addrcnt); if (err) return err; return sctp_send_asconf_del_ip(sk, addrs, addrcnt); default: return -EINVAL; } } static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, int addrlen) { int err; lock_sock(sk); err = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR); release_sock(sk); return err; } static int sctp_connect_new_asoc(struct sctp_endpoint *ep, const union sctp_addr *daddr, const struct sctp_initmsg *init, struct sctp_transport **tp) { struct sctp_association *asoc; struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); enum sctp_scope scope; int err; if (sctp_endpoint_is_peeled_off(ep, daddr)) return -EADDRNOTAVAIL; if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) return -EAGAIN; } else { if (inet_port_requires_bind_service(net, ep->base.bind_addr.port) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; } scope = sctp_scope(daddr); asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); if (!asoc) return -ENOMEM; err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL); if (err < 0) goto free; *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); if (!*tp) { err = -ENOMEM; goto free; } if (!init) return 0; if (init->sinit_num_ostreams) { __u16 outcnt = init->sinit_num_ostreams; asoc->c.sinit_num_ostreams = outcnt; /* outcnt has been changed, need to re-init stream */ err = sctp_stream_init(&asoc->stream, outcnt, 0, GFP_KERNEL); if (err) goto free; } if (init->sinit_max_instreams) asoc->c.sinit_max_instreams = init->sinit_max_instreams; if (init->sinit_max_attempts) asoc->max_init_attempts = init->sinit_max_attempts; if (init->sinit_max_init_timeo) asoc->max_init_timeo = msecs_to_jiffies(init->sinit_max_init_timeo); return 0; free: sctp_association_free(asoc); return err; } static int sctp_connect_add_peer(struct sctp_association *asoc, union sctp_addr *daddr, int addr_len) { struct sctp_endpoint *ep = asoc->ep; struct sctp_association *old; struct sctp_transport *t; int err; err = sctp_verify_addr(ep->base.sk, daddr, addr_len); if (err) return err; old = sctp_endpoint_lookup_assoc(ep, daddr, &t); if (old && old != asoc) return old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN : -EALREADY; if (sctp_endpoint_is_peeled_off(ep, daddr)) return -EADDRNOTAVAIL; t = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); if (!t) return -ENOMEM; return 0; } /* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size) * * Common routine for handling connect() and sctp_connectx(). * Connect will come in with just a single address. */ static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs, int addrs_size, int flags, sctp_assoc_t *assoc_id) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; struct sctp_transport *transport; struct sctp_association *asoc; void *addr_buf = kaddrs; union sctp_addr *daddr; struct sctp_af *af; int walk_size, err; long timeo; if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) || (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) return -EISCONN; daddr = addr_buf; af = sctp_get_af_specific(daddr->sa.sa_family); if (!af || af->sockaddr_len > addrs_size) return -EINVAL; err = sctp_verify_addr(sk, daddr, af->sockaddr_len); if (err) return err; asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); if (asoc) return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN : -EALREADY; err = sctp_connect_new_asoc(ep, daddr, NULL, &transport); if (err) return err; asoc = transport->asoc; addr_buf += af->sockaddr_len; walk_size = af->sockaddr_len; while (walk_size < addrs_size) { err = -EINVAL; if (walk_size + sizeof(sa_family_t) > addrs_size) goto out_free; daddr = addr_buf; af = sctp_get_af_specific(daddr->sa.sa_family); if (!af || af->sockaddr_len + walk_size > addrs_size) goto out_free; if (asoc->peer.port != ntohs(daddr->v4.sin_port)) goto out_free; err = sctp_connect_add_peer(asoc, daddr, af->sockaddr_len); if (err) goto out_free; addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; } /* In case the user of sctp_connectx() wants an association * id back, assign one now. */ if (assoc_id) { err = sctp_assoc_set_id(asoc, GFP_KERNEL); if (err < 0) goto out_free; } err = sctp_primitive_ASSOCIATE(sock_net(sk), asoc, NULL); if (err < 0) goto out_free; /* Initialize sk's dport and daddr for getpeername() */ inet_sk(sk)->inet_dport = htons(asoc->peer.port); sp->pf->to_sk_daddr(daddr, sk); sk->sk_err = 0; if (assoc_id) *assoc_id = asoc->assoc_id; timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); return sctp_wait_for_connect(asoc, &timeo); out_free: pr_debug("%s: took out_free path with asoc:%p kaddrs:%p err:%d\n", __func__, asoc, kaddrs, err); sctp_association_free(asoc); return err; } /* Helper for tunneling sctp_connectx() requests through sctp_setsockopt() * * API 8.9 * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt, * sctp_assoc_t *asoc); * * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses. * If the sd is an IPv6 socket, the addresses passed can either be IPv4 * or IPv6 addresses. * * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see * Section 3.1.2 for this usage. * * addrs is a pointer to an array of one or more socket addresses. Each * address is contained in its appropriate structure (i.e. struct * sockaddr_in or struct sockaddr_in6) the family of the address type * must be used to distengish the address length (note that this * representation is termed a "packed array" of addresses). The caller * specifies the number of addresses in the array with addrcnt. * * On success, sctp_connectx() returns 0. It also sets the assoc_id to * the association id of the new association. On failure, sctp_connectx() * returns -1, and sets errno to the appropriate error code. The assoc_id * is not touched by the kernel. * * For SCTP, the port given in each socket address must be the same, or * sctp_connectx() will fail, setting errno to EINVAL. * * An application can use sctp_connectx to initiate an association with * an endpoint that is multi-homed. Much like sctp_bindx() this call * allows a caller to specify multiple addresses at which a peer can be * reached. The way the SCTP stack uses the list of addresses to set up * the association is implementation dependent. This function only * specifies that the stack will try to make use of all the addresses in * the list when needed. * * Note that the list of addresses passed in is only used for setting up * the association. It does not necessarily equal the set of addresses * the peer uses for the resulting association. If the caller wants to * find out the set of peer addresses, it must use sctp_getpaddrs() to * retrieve them after the association has been set up. * * Basically do nothing but copying the addresses from user to kernel * land and invoking either sctp_connectx(). This is used for tunneling * the sctp_connectx() request through sctp_setsockopt() from userspace. * * On exit there is no need to do sockfd_put(), sys_setsockopt() does * it. * * sk The sk of the socket * addrs The pointer to the addresses * addrssize Size of the addrs buffer * * Returns >=0 if ok, <0 errno code on error. */ static int __sctp_setsockopt_connectx(struct sock *sk, struct sockaddr *kaddrs, int addrs_size, sctp_assoc_t *assoc_id) { int err = 0, flags = 0; pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", __func__, sk, kaddrs, addrs_size); /* make sure the 1st addr's sa_family is accessible later */ if (unlikely(addrs_size < sizeof(sa_family_t))) return -EINVAL; /* Allow security module to validate connectx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_CONNECTX, (struct sockaddr *)kaddrs, addrs_size); if (err) return err; /* in-kernel sockets don't generally have a file allocated to them * if all they do is call sock_create_kern(). */ if (sk->sk_socket->file) flags = sk->sk_socket->file->f_flags; return __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id); } /* * This is an older interface. It's kept for backward compatibility * to the option that doesn't provide association id. */ static int sctp_setsockopt_connectx_old(struct sock *sk, struct sockaddr *kaddrs, int addrs_size) { return __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, NULL); } /* * New interface for the API. The since the API is done with a socket * option, to make it simple we feed back the association id is as a return * indication to the call. Error is always negative and association id is * always positive. */ static int sctp_setsockopt_connectx(struct sock *sk, struct sockaddr *kaddrs, int addrs_size) { sctp_assoc_t assoc_id = 0; int err = 0; err = __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, &assoc_id); if (err) return err; else return assoc_id; } /* * New (hopefully final) interface for the API. * We use the sctp_getaddrs_old structure so that use-space library * can avoid any unnecessary allocations. The only different part * is that we store the actual length of the address buffer into the * addrs_num structure member. That way we can re-use the existing * code. */ #ifdef CONFIG_COMPAT struct compat_sctp_getaddrs_old { sctp_assoc_t assoc_id; s32 addr_num; compat_uptr_t addrs; /* struct sockaddr * */ }; #endif static int sctp_getsockopt_connectx3(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_getaddrs_old param; sctp_assoc_t assoc_id = 0; struct sockaddr *kaddrs; int err = 0; #ifdef CONFIG_COMPAT if (in_compat_syscall()) { struct compat_sctp_getaddrs_old param32; if (len < sizeof(param32)) return -EINVAL; if (copy_from_user(&param32, optval, sizeof(param32))) return -EFAULT; param.assoc_id = param32.assoc_id; param.addr_num = param32.addr_num; param.addrs = compat_ptr(param32.addrs); } else #endif { if (len < sizeof(param)) return -EINVAL; if (copy_from_user(&param, optval, sizeof(param))) return -EFAULT; } kaddrs = memdup_user(param.addrs, param.addr_num); if (IS_ERR(kaddrs)) return PTR_ERR(kaddrs); err = __sctp_setsockopt_connectx(sk, kaddrs, param.addr_num, &assoc_id); kfree(kaddrs); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) return -EFAULT; if (put_user(sizeof(assoc_id), optlen)) return -EFAULT; } return err; } /* API 3.1.4 close() - UDP Style Syntax * Applications use close() to perform graceful shutdown (as described in * Section 10.1 of [SCTP]) on ALL the associations currently represented * by a UDP-style socket. * * The syntax is * * ret = close(int sd); * * sd - the socket descriptor of the associations to be closed. * * To gracefully shutdown a specific association represented by the * UDP-style socket, an application should use the sendmsg() call, * passing no user data, but including the appropriate flag in the * ancillary data (see Section xxxx). * * If sd in the close() call is a branched-off socket representing only * one association, the shutdown is performed on that association only. * * 4.1.6 close() - TCP Style Syntax * * Applications use close() to gracefully close down an association. * * The syntax is: * * int close(int sd); * * sd - the socket descriptor of the association to be closed. * * After an application calls close() on a socket descriptor, no further * socket operations will succeed on that descriptor. * * API 7.1.4 SO_LINGER * * An application using the TCP-style socket can use this option to * perform the SCTP ABORT primitive. The linger option structure is: * * struct linger { * int l_onoff; // option on/off * int l_linger; // linger time * }; * * To enable the option, set l_onoff to 1. If the l_linger value is set * to 0, calling close() is the same as the ABORT primitive. If the * value is set to a negative value, the setsockopt() call will return * an error. If the value is set to a positive value linger_time, the * close() can be blocked for at most linger_time ms. If the graceful * shutdown phase does not finish during this period, close() will * return but the graceful shutdown phase continues in the system. */ static void sctp_close(struct sock *sk, long timeout) { struct net *net = sock_net(sk); struct sctp_endpoint *ep; struct sctp_association *asoc; struct list_head *pos, *temp; unsigned int data_was_unread; pr_debug("%s: sk:%p, timeout:%ld\n", __func__, sk, timeout); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); sk->sk_shutdown = SHUTDOWN_MASK; inet_sk_set_state(sk, SCTP_SS_CLOSING); ep = sctp_sk(sk)->ep; /* Clean up any skbs sitting on the receive queue. */ data_was_unread = sctp_queue_purge_ulpevents(&sk->sk_receive_queue); data_was_unread += sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby); /* Walk all associations on an endpoint. */ list_for_each_safe(pos, temp, &ep->asocs) { asoc = list_entry(pos, struct sctp_association, asocs); if (sctp_style(sk, TCP)) { /* A closed association can still be in the list if * it belongs to a TCP-style listening socket that is * not yet accepted. If so, free it. If not, send an * ABORT or SHUTDOWN based on the linger options. */ if (sctp_state(asoc, CLOSED)) { sctp_association_free(asoc); continue; } } if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) || !skb_queue_empty(&asoc->ulpq.reasm) || !skb_queue_empty(&asoc->ulpq.reasm_uo) || (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { struct sctp_chunk *chunk; chunk = sctp_make_abort_user(asoc, NULL, 0); sctp_primitive_ABORT(net, asoc, chunk); } else sctp_primitive_SHUTDOWN(net, asoc, NULL); } /* On a TCP-style socket, block for at most linger_time if set. */ if (sctp_style(sk, TCP) && timeout) sctp_wait_for_close(sk, timeout); /* This will run the backlog queue. */ release_sock(sk); /* Supposedly, no process has access to the socket, but * the net layers still may. * Also, sctp_destroy_sock() needs to be called with addr_wq_lock * held and that should be grabbed before socket lock. */ spin_lock_bh(&net->sctp.addr_wq_lock); bh_lock_sock_nested(sk); /* Hold the sock, since sk_common_release() will put sock_put() * and we have just a little more cleanup. */ sock_hold(sk); sk_common_release(sk); bh_unlock_sock(sk); spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); SCTP_DBG_OBJCNT_DEC(sock); } /* Handle EPIPE error. */ static int sctp_error(struct sock *sk, int flags, int err) { if (err == -EPIPE) err = sock_error(sk) ? : -EPIPE; if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); return err; } /* API 3.1.3 sendmsg() - UDP Style Syntax * * An application uses sendmsg() and recvmsg() calls to transmit data to * and receive data from its peer. * * ssize_t sendmsg(int socket, const struct msghdr *message, * int flags); * * socket - the socket descriptor of the endpoint. * message - pointer to the msghdr structure which contains a single * user message and possibly some ancillary data. * * See Section 5 for complete description of the data * structures. * * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. * * Note: This function could use a rewrite especially when explicit * connect support comes in. */ /* BUG: We do not implement the equivalent of sk_stream_wait_memory(). */ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs); static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs, struct sctp_sndrcvinfo *srinfo, const struct msghdr *msg, size_t msg_len) { __u16 sflags; int err; if (sctp_sstate(sk, LISTENING) && sctp_style(sk, TCP)) return -EPIPE; if (msg_len > sk->sk_sndbuf) return -EMSGSIZE; memset(cmsgs, 0, sizeof(*cmsgs)); err = sctp_msghdr_parse(msg, cmsgs); if (err) { pr_debug("%s: msghdr parse err:%x\n", __func__, err); return err; } memset(srinfo, 0, sizeof(*srinfo)); if (cmsgs->srinfo) { srinfo->sinfo_stream = cmsgs->srinfo->sinfo_stream; srinfo->sinfo_flags = cmsgs->srinfo->sinfo_flags; srinfo->sinfo_ppid = cmsgs->srinfo->sinfo_ppid; srinfo->sinfo_context = cmsgs->srinfo->sinfo_context; srinfo->sinfo_assoc_id = cmsgs->srinfo->sinfo_assoc_id; srinfo->sinfo_timetolive = cmsgs->srinfo->sinfo_timetolive; } if (cmsgs->sinfo) { srinfo->sinfo_stream = cmsgs->sinfo->snd_sid; srinfo->sinfo_flags = cmsgs->sinfo->snd_flags; srinfo->sinfo_ppid = cmsgs->sinfo->snd_ppid; srinfo->sinfo_context = cmsgs->sinfo->snd_context; srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id; } if (cmsgs->prinfo) { srinfo->sinfo_timetolive = cmsgs->prinfo->pr_value; SCTP_PR_SET_POLICY(srinfo->sinfo_flags, cmsgs->prinfo->pr_policy); } sflags = srinfo->sinfo_flags; if (!sflags && msg_len) return 0; if (sctp_style(sk, TCP) && (sflags & (SCTP_EOF | SCTP_ABORT))) return -EINVAL; if (((sflags & SCTP_EOF) && msg_len > 0) || (!(sflags & (SCTP_EOF | SCTP_ABORT)) && msg_len == 0)) return -EINVAL; if ((sflags & SCTP_ADDR_OVER) && !msg->msg_name) return -EINVAL; return 0; } static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_cmsgs *cmsgs, union sctp_addr *daddr, struct sctp_transport **tp) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; struct cmsghdr *cmsg; __be32 flowinfo = 0; struct sctp_af *af; int err; *tp = NULL; if (sflags & (SCTP_EOF | SCTP_ABORT)) return -EINVAL; if (sctp_style(sk, TCP) && (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING))) return -EADDRNOTAVAIL; /* Label connection socket for first association 1-to-many * style for client sequence socket()->sendmsg(). This * needs to be done before sctp_assoc_add_peer() as that will * set up the initial packet that needs to account for any * security ip options (CIPSO/CALIPSO) added to the packet. */ af = sctp_get_af_specific(daddr->sa.sa_family); if (!af) return -EINVAL; err = security_sctp_bind_connect(sk, SCTP_SENDMSG_CONNECT, (struct sockaddr *)daddr, af->sockaddr_len); if (err < 0) return err; err = sctp_connect_new_asoc(ep, daddr, cmsgs->init, tp); if (err) return err; asoc = (*tp)->asoc; if (!cmsgs->addrs_msg) return 0; if (daddr->sa.sa_family == AF_INET6) flowinfo = daddr->v6.sin6_flowinfo; /* sendv addr list parse */ for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { union sctp_addr _daddr; int dlen; if (cmsg->cmsg_level != IPPROTO_SCTP || (cmsg->cmsg_type != SCTP_DSTADDRV4 && cmsg->cmsg_type != SCTP_DSTADDRV6)) continue; daddr = &_daddr; memset(daddr, 0, sizeof(*daddr)); dlen = cmsg->cmsg_len - sizeof(struct cmsghdr); if (cmsg->cmsg_type == SCTP_DSTADDRV4) { if (dlen < sizeof(struct in_addr)) { err = -EINVAL; goto free; } dlen = sizeof(struct in_addr); daddr->v4.sin_family = AF_INET; daddr->v4.sin_port = htons(asoc->peer.port); memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen); } else { if (dlen < sizeof(struct in6_addr)) { err = -EINVAL; goto free; } dlen = sizeof(struct in6_addr); daddr->v6.sin6_flowinfo = flowinfo; daddr->v6.sin6_family = AF_INET6; daddr->v6.sin6_port = htons(asoc->peer.port); memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); } err = sctp_connect_add_peer(asoc, daddr, sizeof(*daddr)); if (err) goto free; } return 0; free: sctp_association_free(asoc); return err; } static int sctp_sendmsg_check_sflags(struct sctp_association *asoc, __u16 sflags, struct msghdr *msg, size_t msg_len) { struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) return -EPIPE; if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP) && !sctp_state(asoc, ESTABLISHED)) return 0; if (sflags & SCTP_EOF) { pr_debug("%s: shutting down association:%p\n", __func__, asoc); sctp_primitive_SHUTDOWN(net, asoc, NULL); return 0; } if (sflags & SCTP_ABORT) { struct sctp_chunk *chunk; chunk = sctp_make_abort_user(asoc, msg, msg_len); if (!chunk) return -ENOMEM; pr_debug("%s: aborting association:%p\n", __func__, asoc); sctp_primitive_ABORT(net, asoc, chunk); iov_iter_revert(&msg->msg_iter, msg_len); return 0; } return 1; } static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, struct msghdr *msg, size_t msg_len, struct sctp_transport *transport, struct sctp_sndrcvinfo *sinfo) { struct sock *sk = asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct net *net = sock_net(sk); struct sctp_datamsg *datamsg; bool wait_connect = false; struct sctp_chunk *chunk; long timeo; int err; if (sinfo->sinfo_stream >= asoc->stream.outcnt) { err = -EINVAL; goto err; } if (unlikely(!SCTP_SO(&asoc->stream, sinfo->sinfo_stream)->ext)) { err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); if (err) goto err; } if (sp->disable_fragments && msg_len > asoc->frag_point) { err = -EMSGSIZE; goto err; } if (asoc->pmtu_pending) { if (sp->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } if (sctp_wspace(asoc) < (int)msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); if (sk_under_memory_pressure(sk)) sk_mem_reclaim(sk); if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) goto err; if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) { err = -EINVAL; goto err; } } if (sctp_state(asoc, CLOSED)) { err = sctp_primitive_ASSOCIATE(net, asoc, NULL); if (err) goto err; if (asoc->ep->intl_enable) { timeo = sock_sndtimeo(sk, 0); err = sctp_wait_for_connect(asoc, &timeo); if (err) { err = -ESRCH; goto err; } } else { wait_connect = true; } pr_debug("%s: we associated primitively\n", __func__); } datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter); if (IS_ERR(datamsg)) { err = PTR_ERR(datamsg); goto err; } asoc->force_delay = !!(msg->msg_flags & MSG_MORE); list_for_each_entry(chunk, &datamsg->chunks, frag_list) { sctp_chunk_hold(chunk); sctp_set_owner_w(chunk); chunk->transport = transport; } err = sctp_primitive_SEND(net, asoc, datamsg); if (err) { sctp_datamsg_free(datamsg); goto err; } pr_debug("%s: we sent primitively\n", __func__); sctp_datamsg_put(datamsg); if (unlikely(wait_connect)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); sctp_wait_for_connect(asoc, &timeo); } err = msg_len; err: return err; } static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk, const struct msghdr *msg, struct sctp_cmsgs *cmsgs) { union sctp_addr *daddr = NULL; int err; if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { int len = msg->msg_namelen; if (len > sizeof(*daddr)) len = sizeof(*daddr); daddr = (union sctp_addr *)msg->msg_name; err = sctp_verify_addr(sk, daddr, len); if (err) return ERR_PTR(err); } return daddr; } static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct sctp_cmsgs *cmsgs) { if (!cmsgs->srinfo && !cmsgs->sinfo) { sinfo->sinfo_stream = asoc->default_stream; sinfo->sinfo_ppid = asoc->default_ppid; sinfo->sinfo_context = asoc->default_context; sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); if (!cmsgs->prinfo) sinfo->sinfo_flags = asoc->default_flags; } if (!cmsgs->srinfo && !cmsgs->prinfo) sinfo->sinfo_timetolive = asoc->default_timetolive; if (cmsgs->authinfo) { /* Reuse sinfo_tsn to indicate that authinfo was set and * sinfo_ssn to save the keyid on tx path. */ sinfo->sinfo_tsn = 1; sinfo->sinfo_ssn = cmsgs->authinfo->auth_keynumber; } } static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_transport *transport = NULL; struct sctp_sndrcvinfo _sinfo, *sinfo; struct sctp_association *asoc, *tmp; struct sctp_cmsgs cmsgs; union sctp_addr *daddr; bool new = false; __u16 sflags; int err; /* Parse and get snd_info */ err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len); if (err) goto out; sinfo = &_sinfo; sflags = sinfo->sinfo_flags; /* Get daddr from msg */ daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs); if (IS_ERR(daddr)) { err = PTR_ERR(daddr); goto out; } lock_sock(sk); /* SCTP_SENDALL process */ if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) { list_for_each_entry_safe(asoc, tmp, &ep->asocs, asocs) { err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err == 0) continue; if (err < 0) goto out_unlock; sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, NULL, sinfo); if (err < 0) goto out_unlock; iov_iter_revert(&msg->msg_iter, err); } goto out_unlock; } /* Get and check or create asoc */ if (daddr) { asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); if (asoc) { err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err <= 0) goto out_unlock; } else { err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr, &transport); if (err) goto out_unlock; asoc = transport->asoc; new = true; } if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER)) transport = NULL; } else { asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id); if (!asoc) { err = -EPIPE; goto out_unlock; } err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err <= 0) goto out_unlock; } /* Update snd_info with the asoc */ sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); /* Send msg to the asoc */ err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo); if (err < 0 && err != -ESRCH && new) sctp_association_free(asoc); out_unlock: release_sock(sk); out: return sctp_error(sk, msg->msg_flags, err); } /* This is an extended version of skb_pull() that removes the data from the * start of a skb even when data is spread across the list of skb's in the * frag_list. len specifies the total amount of data that needs to be removed. * when 'len' bytes could be removed from the skb, it returns 0. * If 'len' exceeds the total skb length, it returns the no. of bytes that * could not be removed. */ static int sctp_skb_pull(struct sk_buff *skb, int len) { struct sk_buff *list; int skb_len = skb_headlen(skb); int rlen; if (len <= skb_len) { __skb_pull(skb, len); return 0; } len -= skb_len; __skb_pull(skb, skb_len); skb_walk_frags(skb, list) { rlen = sctp_skb_pull(list, len); skb->len -= (len-rlen); skb->data_len -= (len-rlen); if (!rlen) return 0; len = rlen; } return len; } /* API 3.1.3 recvmsg() - UDP Style Syntax * * ssize_t recvmsg(int socket, struct msghdr *message, * int flags); * * socket - the socket descriptor of the endpoint. * message - pointer to the msghdr structure which contains a single * user message and possibly some ancillary data. * * See Section 5 for complete description of the data * structures. * * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. */ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct sctp_ulpevent *event = NULL; struct sctp_sock *sp = sctp_sk(sk); struct sk_buff *skb, *head_skb; int copied; int err = 0; int skb_len; pr_debug("%s: sk:%p, msghdr:%p, len:%zd, noblock:%d, flags:0x%x, " "addr_len:%p)\n", __func__, sk, msg, len, noblock, flags, addr_len); lock_sock(sk); if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING) && !sctp_sstate(sk, CLOSED)) { err = -ENOTCONN; goto out; } skb = sctp_skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; /* Get the total length of the skb including any skb's in the * frag_list. */ skb_len = skb->len; copied = skb_len; if (copied > len) copied = len; err = skb_copy_datagram_msg(skb, 0, msg, copied); event = sctp_skb2event(skb); if (err) goto out_free; if (event->chunk && event->chunk->head_skb) head_skb = event->chunk->head_skb; else head_skb = skb; sock_recv_ts_and_drops(msg, sk, head_skb); if (sctp_ulpevent_is_notification(event)) { msg->msg_flags |= MSG_NOTIFICATION; sp->pf->event_msgname(event, msg->msg_name, addr_len); } else { sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len); } /* Check if we allow SCTP_NXTINFO. */ if (sp->recvnxtinfo) sctp_ulpevent_read_nxtinfo(event, msg, sk); /* Check if we allow SCTP_RCVINFO. */ if (sp->recvrcvinfo) sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. */ if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT)) sctp_ulpevent_read_sndrcvinfo(event, msg); err = copied; /* If skb's length exceeds the user's buffer, update the skb and * push it back to the receive_queue so that the next call to * recvmsg() will return the remaining data. Don't set MSG_EOR. */ if (skb_len > copied) { msg->msg_flags &= ~MSG_EOR; if (flags & MSG_PEEK) goto out_free; sctp_skb_pull(skb, copied); skb_queue_head(&sk->sk_receive_queue, skb); /* When only partial message is copied to the user, increase * rwnd by that amount. If all the data in the skb is read, * rwnd is updated when the event is freed. */ if (!sctp_ulpevent_is_notification(event)) sctp_assoc_rwnd_increase(event->asoc, copied); goto out; } else if ((event->msg_flags & MSG_NOTIFICATION) || (event->msg_flags & MSG_EOR)) msg->msg_flags |= MSG_EOR; else msg->msg_flags &= ~MSG_EOR; out_free: if (flags & MSG_PEEK) { /* Release the skb reference acquired after peeking the skb in * sctp_skb_recv_datagram(). */ kfree_skb(skb); } else { /* Free the event which includes releasing the reference to * the owner of the skb, freeing the skb and updating the * rwnd. */ sctp_ulpevent_free(event); } out: release_sock(sk); return err; } /* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) * * This option is a on/off flag. If enabled no SCTP message * fragmentation will be performed. Instead if a message being sent * exceeds the current PMTU size, the message will NOT be sent and * instead a error will be indicated to the user. */ static int sctp_setsockopt_disable_fragments(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->disable_fragments = (*val == 0) ? 0 : 1; return 0; } static int sctp_setsockopt_events(struct sock *sk, __u8 *sn_type, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int i; if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; for (i = 0; i < optlen; i++) sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i, sn_type[i]); list_for_each_entry(asoc, &sp->ep->asocs, asocs) asoc->subscribe = sctp_sk(sk)->subscribe; /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. */ if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) { struct sctp_ulpevent *event; asoc = sctp_id2assoc(sk, 0); if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; asoc->stream.si->enqueue_event(&asoc->ulpq, event); } } return 0; } /* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) * * This socket option is applicable to the UDP-style socket only. When * set it will cause associations that are idle for more than the * specified number of seconds to automatically close. An association * being idle is defined an association that has NOT sent or received * user data. The special value of '0' indicates that no automatic * close of any associations should be performed. The option expects an * integer defining the number of seconds of idle time before an * association is closed. */ static int sctp_setsockopt_autoclose(struct sock *sk, u32 *optval, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct net *net = sock_net(sk); /* Applicable to UDP-style socket only */ if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (optlen != sizeof(int)) return -EINVAL; sp->autoclose = *optval; if (sp->autoclose > net->sctp.max_autoclose) sp->autoclose = net->sctp.max_autoclose; return 0; } /* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) * * Applications can enable or disable heartbeats for any peer address of * an association, modify an address's heartbeat interval, force a * heartbeat to be sent immediately, and adjust the address's maximum * number of retransmissions sent before an address is considered * unreachable. The following structure is used to access and modify an * address's parameters: * * struct sctp_paddrparams { * sctp_assoc_t spp_assoc_id; * struct sockaddr_storage spp_address; * uint32_t spp_hbinterval; * uint16_t spp_pathmaxrxt; * uint32_t spp_pathmtu; * uint32_t spp_sackdelay; * uint32_t spp_flags; * uint32_t spp_ipv6_flowlabel; * uint8_t spp_dscp; * }; * * spp_assoc_id - (one-to-many style socket) This is filled in the * application, and identifies the association for * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, * in milliseconds. If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be * considered unreachable. If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmtu - When Path MTU discovery is disabled the value * specified here will be the "fixed" path mtu. * Note that if the spp_address field is empty * then all associations on this address will * have this fixed path mtu set upon them. * * spp_sackdelay - When delayed sack is enabled, this value specifies * the number of milliseconds that sacks will be delayed * for. This value will apply to all addresses of an * association if the spp_address field is empty. Note * also, that if delayed sack is enabled and this * value is set to 0, no change is made to the last * recorded delayed sack timer value. * * spp_flags - These flags are used to control various features * on an association. The flag field may contain * zero or more of the following options. * * SPP_HB_ENABLE - Enable heartbeats on the * specified address. Note that if the address * field is empty all addresses for the association * have heartbeats enabled upon them. * * SPP_HB_DISABLE - Disable heartbeats on the * speicifed address. Note that if the address * field is empty all addresses for the association * will have their heartbeats disabled. Note also * that SPP_HB_ENABLE and SPP_HB_DISABLE are * mutually exclusive, only one of these two should * be specified. Enabling both fields will have * undetermined results. * * SPP_HB_DEMAND - Request a user initiated heartbeat * to be made immediately. * * SPP_HB_TIME_IS_ZERO - Specify's that the time for * heartbeat delayis to be set to the value of 0 * milliseconds. * * SPP_PMTUD_ENABLE - This field will enable PMTU * discovery upon the specified address. Note that * if the address feild is empty then all addresses * on the association are effected. * * SPP_PMTUD_DISABLE - This field will disable PMTU * discovery upon the specified address. Note that * if the address feild is empty then all addresses * on the association are effected. Not also that * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually * exclusive. Enabling both will have undetermined * results. * * SPP_SACKDELAY_ENABLE - Setting this flag turns * on delayed sack. The time specified in spp_sackdelay * is used to specify the sack delay for this address. Note * that if spp_address is empty then all addresses will * enable delayed sack and take on the sack delay * value specified in spp_sackdelay. * SPP_SACKDELAY_DISABLE - Setting this flag turns * off delayed sack. If the spp_address field is blank then * delayed sack is disabled for the entire association. Note * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. * * SPP_IPV6_FLOWLABEL: Setting this flag enables the * setting of the IPV6 flow label value. The value is * contained in the spp_ipv6_flowlabel field. * Upon retrieval, this flag will be set to indicate that * the spp_ipv6_flowlabel field has a valid value returned. * If a specific destination address is set (in the * spp_address field), then the value returned is that of * the address. If just an association is specified (and * no address), then the association's default flow label * is returned. If neither an association nor a destination * is specified, then the socket's default flow label is * returned. For non-IPv6 sockets, this flag will be left * cleared. * * SPP_DSCP: Setting this flag enables the setting of the * Differentiated Services Code Point (DSCP) value * associated with either the association or a specific * address. The value is obtained in the spp_dscp field. * Upon retrieval, this flag will be set to indicate that * the spp_dscp field has a valid value returned. If a * specific destination address is set when called (in the * spp_address field), then that specific destination * address's DSCP value is returned. If just an association * is specified, then the association's default DSCP is * returned. If neither an association nor a destination is * specified, then the socket's default DSCP is returned. * * spp_ipv6_flowlabel * - This field is used in conjunction with the * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. * The 20 least significant bits are used for the flow * label. This setting has precedence over any IPv6-layer * setting. * * spp_dscp - This field is used in conjunction with the SPP_DSCP flag * and contains the DSCP. The 6 most significant bits are * used for the DSCP. This setting has precedence over any * IPv4- or IPv6- layer setting. */ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, struct sctp_transport *trans, struct sctp_association *asoc, struct sctp_sock *sp, int hb_change, int pmtud_change, int sackdelay_change) { int error; if (params->spp_flags & SPP_HB_DEMAND && trans) { error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc->base.net, trans->asoc, trans); if (error) return error; } /* Note that unless the spp_flag is set to SPP_HB_ENABLE the value of * this field is ignored. Note also that a value of zero indicates * the current setting should be left unchanged. */ if (params->spp_flags & SPP_HB_ENABLE) { /* Re-zero the interval if the SPP_HB_TIME_IS_ZERO is * set. This lets us use 0 value when this flag * is set. */ if (params->spp_flags & SPP_HB_TIME_IS_ZERO) params->spp_hbinterval = 0; if (params->spp_hbinterval || (params->spp_flags & SPP_HB_TIME_IS_ZERO)) { if (trans) { trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); sctp_transport_reset_hb_timer(trans); } else if (asoc) { asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); } else { sp->hbinterval = params->spp_hbinterval; } } } if (hb_change) { if (trans) { trans->param_flags = (trans->param_flags & ~SPP_HB) | hb_change; } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_HB) | hb_change; } else { sp->param_flags = (sp->param_flags & ~SPP_HB) | hb_change; } } /* When Path MTU discovery is disabled the value specified here will * be the "fixed" path mtu (i.e. the value of the spp_flags field must * include the flag SPP_PMTUD_DISABLE for this field to have any * effect). */ if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) { if (trans) { trans->pathmtu = params->spp_pathmtu; sctp_assoc_sync_pmtu(asoc); } else if (asoc) { sctp_assoc_set_pmtu(asoc, params->spp_pathmtu); } else { sp->pathmtu = params->spp_pathmtu; } } if (pmtud_change) { if (trans) { int update = (trans->param_flags & SPP_PMTUD_DISABLE) && (params->spp_flags & SPP_PMTUD_ENABLE); trans->param_flags = (trans->param_flags & ~SPP_PMTUD) | pmtud_change; if (update) { sctp_transport_pmtu(trans, sctp_opt2sk(sp)); sctp_assoc_sync_pmtu(asoc); } sctp_transport_pl_reset(trans); } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; } else { sp->param_flags = (sp->param_flags & ~SPP_PMTUD) | pmtud_change; } } /* Note that unless the spp_flag is set to SPP_SACKDELAY_ENABLE the * value of this field is ignored. Note also that a value of zero * indicates the current setting should be left unchanged. */ if ((params->spp_flags & SPP_SACKDELAY_ENABLE) && params->spp_sackdelay) { if (trans) { trans->sackdelay = msecs_to_jiffies(params->spp_sackdelay); } else if (asoc) { asoc->sackdelay = msecs_to_jiffies(params->spp_sackdelay); } else { sp->sackdelay = params->spp_sackdelay; } } if (sackdelay_change) { if (trans) { trans->param_flags = (trans->param_flags & ~SPP_SACKDELAY) | sackdelay_change; } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_SACKDELAY) | sackdelay_change; } else { sp->param_flags = (sp->param_flags & ~SPP_SACKDELAY) | sackdelay_change; } } /* Note that a value of zero indicates the current setting should be left unchanged. */ if (params->spp_pathmaxrxt) { if (trans) { trans->pathmaxrxt = params->spp_pathmaxrxt; } else if (asoc) { asoc->pathmaxrxt = params->spp_pathmaxrxt; } else { sp->pathmaxrxt = params->spp_pathmaxrxt; } } if (params->spp_flags & SPP_IPV6_FLOWLABEL) { if (trans) { if (trans->ipaddr.sa.sa_family == AF_INET6) { trans->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } } else if (asoc) { struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->ipaddr.sa.sa_family != AF_INET6) continue; t->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; t->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } asoc->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) { sp->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } } if (params->spp_flags & SPP_DSCP) { if (trans) { trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; trans->dscp |= SCTP_DSCP_SET_MASK; } else if (asoc) { struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; t->dscp |= SCTP_DSCP_SET_MASK; } asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; asoc->dscp |= SCTP_DSCP_SET_MASK; } else { sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; sp->dscp |= SCTP_DSCP_SET_MASK; } } return 0; } static int sctp_setsockopt_peer_addr_params(struct sock *sk, struct sctp_paddrparams *params, unsigned int optlen) { struct sctp_transport *trans = NULL; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); int error; int hb_change, pmtud_change, sackdelay_change; if (optlen == ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4)) { if (params->spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) return -EINVAL; } else if (optlen != sizeof(*params)) { return -EINVAL; } /* Validate flags and value parameters. */ hb_change = params->spp_flags & SPP_HB; pmtud_change = params->spp_flags & SPP_PMTUD; sackdelay_change = params->spp_flags & SPP_SACKDELAY; if (hb_change == SPP_HB || pmtud_change == SPP_PMTUD || sackdelay_change == SPP_SACKDELAY || params->spp_sackdelay > 500 || (params->spp_pathmtu && params->spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&params->spp_address)) { trans = sctp_addr_id2transport(sk, &params->spp_address, params->spp_assoc_id); if (!trans) return -EINVAL; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params->spp_assoc_id); if (!asoc && params->spp_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Heartbeat demand can only be sent on a transport or * association, but not a socket. */ if (params->spp_flags & SPP_HB_DEMAND && !trans && !asoc) return -EINVAL; /* Process parameters. */ error = sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); if (error) return error; /* If changes are for association, also apply parameters to each * transport. */ if (!trans && asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); } } return 0; } static inline __u32 sctp_spp_sackdelay_enable(__u32 param_flags) { return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_ENABLE; } static inline __u32 sctp_spp_sackdelay_disable(__u32 param_flags) { return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_DISABLE; } static void sctp_apply_asoc_delayed_ack(struct sctp_sack_info *params, struct sctp_association *asoc) { struct sctp_transport *trans; if (params->sack_delay) { asoc->sackdelay = msecs_to_jiffies(params->sack_delay); asoc->param_flags = sctp_spp_sackdelay_enable(asoc->param_flags); } if (params->sack_freq == 1) { asoc->param_flags = sctp_spp_sackdelay_disable(asoc->param_flags); } else if (params->sack_freq > 1) { asoc->sackfreq = params->sack_freq; asoc->param_flags = sctp_spp_sackdelay_enable(asoc->param_flags); } list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { if (params->sack_delay) { trans->sackdelay = msecs_to_jiffies(params->sack_delay); trans->param_flags = sctp_spp_sackdelay_enable(trans->param_flags); } if (params->sack_freq == 1) { trans->param_flags = sctp_spp_sackdelay_disable(trans->param_flags); } else if (params->sack_freq > 1) { trans->sackfreq = params->sack_freq; trans->param_flags = sctp_spp_sackdelay_enable(trans->param_flags); } } } /* * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK) * * This option will effect the way delayed acks are performed. This * option allows you to get or set the delayed ack time, in * milliseconds. It also allows changing the delayed ack frequency. * Changing the frequency to 1 disables the delayed sack algorithm. If * the assoc_id is 0, then this sets or gets the endpoints default * values. If the assoc_id field is non-zero, then the set or get * effects the specified association for the one to many model (the * assoc_id field is ignored by the one to one model). Note that if * sack_delay or sack_freq are 0 when setting this option, then the * current values will remain unchanged. * * struct sctp_sack_info { * sctp_assoc_t sack_assoc_id; * uint32_t sack_delay; * uint32_t sack_freq; * }; * * sack_assoc_id - This parameter, indicates which association the user * is performing an action upon. Note that if this field's value is * zero then the endpoints default value is changed (effecting future * associations only). * * sack_delay - This parameter contains the number of milliseconds that * the user is requesting the delayed ACK timer be set to. Note that * this value is defined in the standard to be between 200 and 500 * milliseconds. * * sack_freq - This parameter contains the number of packets that must * be received before a sack is sent without waiting for the delay * timer to expire. The default value for this is 2, setting this * value to 1 will disable the delayed sack algorithm. */ static int __sctp_setsockopt_delayed_ack(struct sock *sk, struct sctp_sack_info *params) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; /* Validate value parameter. */ if (params->sack_delay > 500) return -EINVAL; /* Get association, if sack_assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params->sack_assoc_id); if (!asoc && params->sack_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { sctp_apply_asoc_delayed_ack(params, asoc); return 0; } if (sctp_style(sk, TCP)) params->sack_assoc_id = SCTP_FUTURE_ASSOC; if (params->sack_assoc_id == SCTP_FUTURE_ASSOC || params->sack_assoc_id == SCTP_ALL_ASSOC) { if (params->sack_delay) { sp->sackdelay = params->sack_delay; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } if (params->sack_freq == 1) { sp->param_flags = sctp_spp_sackdelay_disable(sp->param_flags); } else if (params->sack_freq > 1) { sp->sackfreq = params->sack_freq; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } } if (params->sack_assoc_id == SCTP_CURRENT_ASSOC || params->sack_assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) sctp_apply_asoc_delayed_ack(params, asoc); return 0; } static int sctp_setsockopt_delayed_ack(struct sock *sk, struct sctp_sack_info *params, unsigned int optlen) { if (optlen == sizeof(struct sctp_assoc_value)) { struct sctp_assoc_value *v = (struct sctp_assoc_value *)params; struct sctp_sack_info p; pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of struct sctp_assoc_value in delayed_ack socket option.\n" "Use struct sctp_sack_info instead\n", current->comm, task_pid_nr(current)); p.sack_assoc_id = v->assoc_id; p.sack_delay = v->assoc_value; p.sack_freq = v->assoc_value ? 0 : 1; return __sctp_setsockopt_delayed_ack(sk, &p); } if (optlen != sizeof(struct sctp_sack_info)) return -EINVAL; if (params->sack_delay == 0 && params->sack_freq == 0) return 0; return __sctp_setsockopt_delayed_ack(sk, params); } /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association * initialization. The option name argument to setsockopt() and getsockopt() * is SCTP_INITMSG. * * Setting initialization parameters is effective only on an unconnected * socket (for UDP-style sockets only future associations are effected * by the change). With TCP-style sockets, this option is inherited by * sockets derived from a listener socket. */ static int sctp_setsockopt_initmsg(struct sock *sk, struct sctp_initmsg *sinit, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof(struct sctp_initmsg)) return -EINVAL; if (sinit->sinit_num_ostreams) sp->initmsg.sinit_num_ostreams = sinit->sinit_num_ostreams; if (sinit->sinit_max_instreams) sp->initmsg.sinit_max_instreams = sinit->sinit_max_instreams; if (sinit->sinit_max_attempts) sp->initmsg.sinit_max_attempts = sinit->sinit_max_attempts; if (sinit->sinit_max_init_timeo) sp->initmsg.sinit_max_init_timeo = sinit->sinit_max_init_timeo; return 0; } /* * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) * * Applications that wish to use the sendto() system call may wish to * specify a default set of parameters that would normally be supplied * through the inclusion of ancillary data. This socket option allows * such an application to set the default sctp_sndrcvinfo structure. * The application that wishes to use this socket option simply passes * in to this call the sctp_sndrcvinfo structure defined in Section * 5.2.2) The input parameters accepted by this call include * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, * sinfo_timetolive. The user must provide the sinfo_assoc_id field in * to this call if the caller is using the UDP model. */ static int sctp_setsockopt_default_send_param(struct sock *sk, struct sctp_sndrcvinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen != sizeof(*info)) return -EINVAL; if (info->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; asoc = sctp_id2assoc(sk, info->sinfo_assoc_id); if (!asoc && info->sinfo_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->default_stream = info->sinfo_stream; asoc->default_flags = info->sinfo_flags; asoc->default_ppid = info->sinfo_ppid; asoc->default_context = info->sinfo_context; asoc->default_timetolive = info->sinfo_timetolive; return 0; } if (sctp_style(sk, TCP)) info->sinfo_assoc_id = SCTP_FUTURE_ASSOC; if (info->sinfo_assoc_id == SCTP_FUTURE_ASSOC || info->sinfo_assoc_id == SCTP_ALL_ASSOC) { sp->default_stream = info->sinfo_stream; sp->default_flags = info->sinfo_flags; sp->default_ppid = info->sinfo_ppid; sp->default_context = info->sinfo_context; sp->default_timetolive = info->sinfo_timetolive; } if (info->sinfo_assoc_id == SCTP_CURRENT_ASSOC || info->sinfo_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { asoc->default_stream = info->sinfo_stream; asoc->default_flags = info->sinfo_flags; asoc->default_ppid = info->sinfo_ppid; asoc->default_context = info->sinfo_context; asoc->default_timetolive = info->sinfo_timetolive; } } return 0; } /* RFC6458, Section 8.1.31. Set/get Default Send Parameters * (SCTP_DEFAULT_SNDINFO) */ static int sctp_setsockopt_default_sndinfo(struct sock *sk, struct sctp_sndinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen != sizeof(*info)) return -EINVAL; if (info->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; asoc = sctp_id2assoc(sk, info->snd_assoc_id); if (!asoc && info->snd_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->default_stream = info->snd_sid; asoc->default_flags = info->snd_flags; asoc->default_ppid = info->snd_ppid; asoc->default_context = info->snd_context; return 0; } if (sctp_style(sk, TCP)) info->snd_assoc_id = SCTP_FUTURE_ASSOC; if (info->snd_assoc_id == SCTP_FUTURE_ASSOC || info->snd_assoc_id == SCTP_ALL_ASSOC) { sp->default_stream = info->snd_sid; sp->default_flags = info->snd_flags; sp->default_ppid = info->snd_ppid; sp->default_context = info->snd_context; } if (info->snd_assoc_id == SCTP_CURRENT_ASSOC || info->snd_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { asoc->default_stream = info->snd_sid; asoc->default_flags = info->snd_flags; asoc->default_ppid = info->snd_ppid; asoc->default_context = info->snd_context; } } return 0; } /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) * * Requests that the local SCTP stack use the enclosed peer address as * the association primary. The enclosed address must be one of the * association peer's addresses. */ static int sctp_setsockopt_primary_addr(struct sock *sk, struct sctp_prim *prim, unsigned int optlen) { struct sctp_transport *trans; struct sctp_af *af; int err; if (optlen != sizeof(struct sctp_prim)) return -EINVAL; /* Allow security module to validate address but need address len. */ af = sctp_get_af_specific(prim->ssp_addr.ss_family); if (!af) return -EINVAL; err = security_sctp_bind_connect(sk, SCTP_PRIMARY_ADDR, (struct sockaddr *)&prim->ssp_addr, af->sockaddr_len); if (err) return err; trans = sctp_addr_id2transport(sk, &prim->ssp_addr, prim->ssp_assoc_id); if (!trans) return -EINVAL; sctp_assoc_set_primary(trans->asoc, trans); return 0; } /* * 7.1.5 SCTP_NODELAY * * Turn on/off any Nagle-like algorithm. This means that packets are * generally sent as soon as possible and no unnecessary delays are * introduced, at the cost of more packets in the network. Expects an * integer boolean flag. */ static int sctp_setsockopt_nodelay(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->nodelay = (*val == 0) ? 0 : 1; return 0; } /* * * 7.1.1 SCTP_RTOINFO * * The protocol parameters used to initialize and bound retransmission * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access * and modify these parameters. * All parameters are time values, in milliseconds. A value of 0, when * modifying the parameters, indicates that the current value should not * be changed. * */ static int sctp_setsockopt_rtoinfo(struct sock *sk, struct sctp_rtoinfo *rtoinfo, unsigned int optlen) { struct sctp_association *asoc; unsigned long rto_min, rto_max; struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof (struct sctp_rtoinfo)) return -EINVAL; asoc = sctp_id2assoc(sk, rtoinfo->srto_assoc_id); /* Set the values to the specific association */ if (!asoc && rtoinfo->srto_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; rto_max = rtoinfo->srto_max; rto_min = rtoinfo->srto_min; if (rto_max) rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max; else rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max; if (rto_min) rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min; else rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min; if (rto_min > rto_max) return -EINVAL; if (asoc) { if (rtoinfo->srto_initial != 0) asoc->rto_initial = msecs_to_jiffies(rtoinfo->srto_initial); asoc->rto_max = rto_max; asoc->rto_min = rto_min; } else { /* If there is no association or the association-id = 0 * set the values to the endpoint. */ if (rtoinfo->srto_initial != 0) sp->rtoinfo.srto_initial = rtoinfo->srto_initial; sp->rtoinfo.srto_max = rto_max; sp->rtoinfo.srto_min = rto_min; } return 0; } /* * * 7.1.2 SCTP_ASSOCINFO * * This option is used to tune the maximum retransmission attempts * of the association. * Returns an error if the new association retransmission value is * greater than the sum of the retransmission value of the peer. * See [SCTP] for more information. * */ static int sctp_setsockopt_associnfo(struct sock *sk, struct sctp_assocparams *assocparams, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assocparams)) return -EINVAL; asoc = sctp_id2assoc(sk, assocparams->sasoc_assoc_id); if (!asoc && assocparams->sasoc_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Set the values to the specific association */ if (asoc) { if (assocparams->sasoc_asocmaxrxt != 0) { __u32 path_sum = 0; int paths = 0; struct sctp_transport *peer_addr; list_for_each_entry(peer_addr, &asoc->peer.transport_addr_list, transports) { path_sum += peer_addr->pathmaxrxt; paths++; } /* Only validate asocmaxrxt if we have more than * one path/transport. We do this because path * retransmissions are only counted when we have more * then one path. */ if (paths > 1 && assocparams->sasoc_asocmaxrxt > path_sum) return -EINVAL; asoc->max_retrans = assocparams->sasoc_asocmaxrxt; } if (assocparams->sasoc_cookie_life != 0) asoc->cookie_life = ms_to_ktime(assocparams->sasoc_cookie_life); } else { /* Set the values to the endpoint */ struct sctp_sock *sp = sctp_sk(sk); if (assocparams->sasoc_asocmaxrxt != 0) sp->assocparams.sasoc_asocmaxrxt = assocparams->sasoc_asocmaxrxt; if (assocparams->sasoc_cookie_life != 0) sp->assocparams.sasoc_cookie_life = assocparams->sasoc_cookie_life; } return 0; } /* * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) * * This socket option is a boolean flag which turns on or off mapped V4 * addresses. If this option is turned on and the socket is type * PF_INET6, then IPv4 addresses will be mapped to V6 representation. * If this option is turned off, then no mapping will be done of V4 * addresses and a user will receive both PF_INET6 and PF_INET type * addresses on the socket. */ static int sctp_setsockopt_mappedv4(struct sock *sk, int *val, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; if (*val) sp->v4mapped = 1; else sp->v4mapped = 0; return 0; } /* * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG) * This option will get or set the maximum size to put in any outgoing * SCTP DATA chunk. If a message is larger than this size it will be * fragmented by SCTP into the specified size. Note that the underlying * SCTP implementation may fragment into smaller sized chunks when the * PMTU of the underlying association is smaller than the value set by * the user. The default value for this option is '0' which indicates * the user is NOT limiting fragmentation and only the PMTU will effect * SCTP's choice of DATA chunk size. Note also that values set larger * than the maximum size of an IP datagram will effectively let SCTP * control fragmentation (i.e. the same as setting this option to 0). * * The following structure is used to access and modify this parameter: * * struct sctp_assoc_value { * sctp_assoc_t assoc_id; * uint32_t assoc_value; * }; * * assoc_id: This parameter is ignored for one-to-one style sockets. * For one-to-many style sockets this parameter indicates which * association the user is performing an action upon. Note that if * this field's value is zero then the endpoints default value is * changed (effecting future associations only). * assoc_value: This parameter specifies the maximum size in bytes. */ static int sctp_setsockopt_maxseg(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; sctp_assoc_t assoc_id; int val; if (optlen == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in maxseg socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); assoc_id = SCTP_FUTURE_ASSOC; val = *(int *)params; } else if (optlen == sizeof(struct sctp_assoc_value)) { assoc_id = params->assoc_id; val = params->assoc_value; } else { return -EINVAL; } asoc = sctp_id2assoc(sk, assoc_id); if (!asoc && assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (val) { int min_len, max_len; __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) : sizeof(struct sctp_data_chunk); min_len = sctp_min_frag_point(sp, datasize); max_len = SCTP_MAX_CHUNK_LEN - datasize; if (val < min_len || val > max_len) return -EINVAL; } if (asoc) { asoc->user_frag = val; sctp_assoc_update_frag_point(asoc); } else { sp->user_frag = val; } return 0; } /* * 7.1.9 Set Peer Primary Address (SCTP_SET_PEER_PRIMARY_ADDR) * * Requests that the peer mark the enclosed address as the association * primary. The enclosed address must be one of the association's * locally bound addresses. The following structure is used to make a * set primary request: */ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, struct sctp_setpeerprim *prim, unsigned int optlen) { struct sctp_sock *sp; struct sctp_association *asoc = NULL; struct sctp_chunk *chunk; struct sctp_af *af; int err; sp = sctp_sk(sk); if (!sp->ep->asconf_enable) return -EPERM; if (optlen != sizeof(struct sctp_setpeerprim)) return -EINVAL; asoc = sctp_id2assoc(sk, prim->sspp_assoc_id); if (!asoc) return -EINVAL; if (!asoc->peer.asconf_capable) return -EPERM; if (asoc->peer.addip_disabled_mask & SCTP_PARAM_SET_PRIMARY) return -EPERM; if (!sctp_state(asoc, ESTABLISHED)) return -ENOTCONN; af = sctp_get_af_specific(prim->sspp_addr.ss_family); if (!af) return -EINVAL; if (!af->addr_valid((union sctp_addr *)&prim->sspp_addr, sp, NULL)) return -EADDRNOTAVAIL; if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim->sspp_addr)) return -EADDRNOTAVAIL; /* Allow security module to validate address. */ err = security_sctp_bind_connect(sk, SCTP_SET_PEER_PRIMARY_ADDR, (struct sockaddr *)&prim->sspp_addr, af->sockaddr_len); if (err) return err; /* Create an ASCONF chunk with SET_PRIMARY parameter */ chunk = sctp_make_asconf_set_prim(asoc, (union sctp_addr *)&prim->sspp_addr); if (!chunk) return -ENOMEM; err = sctp_send_asconf(asoc, chunk); pr_debug("%s: we set peer primary addr primitively\n", __func__); return err; } static int sctp_setsockopt_adaptation_layer(struct sock *sk, struct sctp_setadaptation *adapt, unsigned int optlen) { if (optlen != sizeof(struct sctp_setadaptation)) return -EINVAL; sctp_sk(sk)->adaptation_ind = adapt->ssb_adaptation_ind; return 0; } /* * 7.1.29. Set or Get the default context (SCTP_CONTEXT) * * The context field in the sctp_sndrcvinfo structure is normally only * used when a failed message is retrieved holding the value that was * sent down on the actual send call. This option allows the setting of * a default context on an association basis that will be received on * reading messages from the peer. This is especially helpful in the * one-2-many model for an application to keep some reference to an * internal state machine that is processing messages on the * association. Note that the setting of this value only effects * received messages from the peer and does not effect the value that is * saved with outbound messages. */ static int sctp_setsockopt_context(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assoc_value)) return -EINVAL; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->default_rcv_context = params->assoc_value; return 0; } if (sctp_style(sk, TCP)) params->assoc_id = SCTP_FUTURE_ASSOC; if (params->assoc_id == SCTP_FUTURE_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) sp->default_rcv_context = params->assoc_value; if (params->assoc_id == SCTP_CURRENT_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) asoc->default_rcv_context = params->assoc_value; return 0; } /* * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) * * This options will at a minimum specify if the implementation is doing * fragmented interleave. Fragmented interleave, for a one to many * socket, is when subsequent calls to receive a message may return * parts of messages from different associations. Some implementations * may allow you to turn this value on or off. If so, when turned off, * no fragment interleave will occur (which will cause a head of line * blocking amongst multiple associations sharing the same one to many * socket). When this option is turned on, then each receive call may * come from a different association (thus the user must receive data * with the extended calls (e.g. sctp_recvmsg) to keep track of which * association each receive belongs to. * * This option takes a boolean value. A non-zero value indicates that * fragmented interleave is on. A value of zero indicates that * fragmented interleave is off. * * Note that it is important that an implementation that allows this * option to be turned on, have it off by default. Otherwise an unaware * application using the one to many model may become confused and act * incorrectly. */ static int sctp_setsockopt_fragment_interleave(struct sock *sk, int *val, unsigned int optlen) { if (optlen != sizeof(int)) return -EINVAL; sctp_sk(sk)->frag_interleave = !!*val; if (!sctp_sk(sk)->frag_interleave) sctp_sk(sk)->ep->intl_enable = 0; return 0; } /* * 8.1.21. Set or Get the SCTP Partial Delivery Point * (SCTP_PARTIAL_DELIVERY_POINT) * * This option will set or get the SCTP partial delivery point. This * point is the size of a message where the partial delivery API will be * invoked to help free up rwnd space for the peer. Setting this to a * lower value will cause partial deliveries to happen more often. The * calls argument is an integer that sets or gets the partial delivery * point. Note also that the call will fail if the user attempts to set * this value larger than the socket receive buffer size. * * Note that any single message having a length smaller than or equal to * the SCTP partial delivery point will be delivered in one single read * call as long as the user provided buffer is large enough to hold the * message. */ static int sctp_setsockopt_partial_delivery_point(struct sock *sk, u32 *val, unsigned int optlen) { if (optlen != sizeof(u32)) return -EINVAL; /* Note: We double the receive buffer from what the user sets * it to be, also initial rwnd is based on rcvbuf/2. */ if (*val > (sk->sk_rcvbuf >> 1)) return -EINVAL; sctp_sk(sk)->pd_point = *val; return 0; /* is this the right error code? */ } /* * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST) * * This option will allow a user to change the maximum burst of packets * that can be emitted by this association. Note that the default value * is 4, and some implementations may restrict this setting so that it * can only be lowered. * * NOTE: This text doesn't seem right. Do this on a socket basis with * future associations inheriting the socket value. */ static int sctp_setsockopt_maxburst(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; sctp_assoc_t assoc_id; u32 assoc_value; if (optlen == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in max_burst socket option deprecated.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); assoc_id = SCTP_FUTURE_ASSOC; assoc_value = *((int *)params); } else if (optlen == sizeof(struct sctp_assoc_value)) { assoc_id = params->assoc_id; assoc_value = params->assoc_value; } else return -EINVAL; asoc = sctp_id2assoc(sk, assoc_id); if (!asoc && assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->max_burst = assoc_value; return 0; } if (sctp_style(sk, TCP)) assoc_id = SCTP_FUTURE_ASSOC; if (assoc_id == SCTP_FUTURE_ASSOC || assoc_id == SCTP_ALL_ASSOC) sp->max_burst = assoc_value; if (assoc_id == SCTP_CURRENT_ASSOC || assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) asoc->max_burst = assoc_value; return 0; } /* * 7.1.18. Add a chunk that must be authenticated (SCTP_AUTH_CHUNK) * * This set option adds a chunk type that the user is requesting to be * received only in an authenticated way. Changes to the list of chunks * will only effect future associations on the socket. */ static int sctp_setsockopt_auth_chunk(struct sock *sk, struct sctp_authchunk *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authchunk)) return -EINVAL; switch (val->sauth_chunk) { case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: case SCTP_CID_AUTH: return -EINVAL; } /* add this chunk id to the endpoint */ return sctp_auth_ep_add_chunkid(ep, val->sauth_chunk); } /* * 7.1.19. Get or set the list of supported HMAC Identifiers (SCTP_HMAC_IDENT) * * This option gets or sets the list of HMAC algorithms that the local * endpoint requires the peer to use. */ static int sctp_setsockopt_hmac_ident(struct sock *sk, struct sctp_hmacalgo *hmacs, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; u32 idents; if (!ep->auth_enable) return -EACCES; if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + SCTP_AUTH_NUM_HMACS * sizeof(u16)); idents = hmacs->shmac_num_idents; if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS || (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) return -EINVAL; return sctp_auth_ep_set_hmacs(ep, hmacs); } /* * 7.1.20. Set a shared key (SCTP_AUTH_KEY) * * This option will set a shared secret key which is used to build an * association shared key. */ static int sctp_setsockopt_auth_key(struct sock *sk, struct sctp_authkey *authkey, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = -EINVAL; if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; /* authkey->sca_keylength is u16, so optlen can't be bigger than * this. */ optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(*authkey)); if (authkey->sca_keylength > optlen - sizeof(*authkey)) goto out; asoc = sctp_id2assoc(sk, authkey->sca_assoc_id); if (!asoc && authkey->sca_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) { ret = sctp_auth_set_key(ep, asoc, authkey); goto out; } if (sctp_style(sk, TCP)) authkey->sca_assoc_id = SCTP_FUTURE_ASSOC; if (authkey->sca_assoc_id == SCTP_FUTURE_ASSOC || authkey->sca_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_set_key(ep, asoc, authkey); if (ret) goto out; } ret = 0; if (authkey->sca_assoc_id == SCTP_CURRENT_ASSOC || authkey->sca_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_set_key(ep, asoc, authkey); if (res && !ret) ret = res; } } out: memzero_explicit(authkey, optlen); return ret; } /* * 7.1.21. Get or set the active shared key (SCTP_AUTH_ACTIVE_KEY) * * This option will get or set the active shared key to be used to build * the association shared key. */ static int sctp_setsockopt_active_key(struct sock *sk, struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; asoc = sctp_id2assoc(sk, val->scact_assoc_id); if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) val->scact_assoc_id = SCTP_FUTURE_ASSOC; if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (ret) return ret; } if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (res && !ret) ret = res; } } return ret; } /* * 7.1.22. Delete a shared key (SCTP_AUTH_DELETE_KEY) * * This set option will delete a shared secret key from use. */ static int sctp_setsockopt_del_key(struct sock *sk, struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; asoc = sctp_id2assoc(sk, val->scact_assoc_id); if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) val->scact_assoc_id = SCTP_FUTURE_ASSOC; if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (res && !ret) ret = res; } } return ret; } /* * 8.3.4 Deactivate a Shared Key (SCTP_AUTH_DEACTIVATE_KEY) * * This set option will deactivate a shared secret key. */ static int sctp_setsockopt_deactivate_key(struct sock *sk, struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; asoc = sctp_id2assoc(sk, val->scact_assoc_id); if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) val->scact_assoc_id = SCTP_FUTURE_ASSOC; if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (res && !ret) ret = res; } } return ret; } /* * 8.1.23 SCTP_AUTO_ASCONF * * This option will enable or disable the use of the automatic generation of * ASCONF chunks to add and delete addresses to an existing association. Note * that this option has two caveats namely: a) it only affects sockets that * are bound to all addresses available to the SCTP stack, and b) the system * administrator may have an overriding control that turns the ASCONF feature * off no matter what setting the socket option may have. * This option expects an integer boolean flag, where a non-zero value turns on * the option, and a zero value turns off the option. * Note. In this implementation, socket operation overrides default parameter * being set by sysctl as well as FreeBSD implementation */ static int sctp_setsockopt_auto_asconf(struct sock *sk, int *val, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; if (!sctp_is_ep_boundall(sk) && *val) return -EINVAL; if ((*val && sp->do_auto_asconf) || (!*val && !sp->do_auto_asconf)) return 0; spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); if (*val == 0 && sp->do_auto_asconf) { list_del(&sp->auto_asconf_list); sp->do_auto_asconf = 0; } else if (*val && !sp->do_auto_asconf) { list_add_tail(&sp->auto_asconf_list, &sock_net(sk)->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; } spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock); return 0; } /* * SCTP_PEER_ADDR_THLDS * * This option allows us to alter the partially failed threshold for one or all * transports in an association. See Section 6.1 of: * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, struct sctp_paddrthlds_v2 *val, unsigned int optlen, bool v2) { struct sctp_transport *trans; struct sctp_association *asoc; int len; len = v2 ? sizeof(*val) : sizeof(struct sctp_paddrthlds); if (optlen < len) return -EINVAL; if (v2 && val->spt_pathpfthld > val->spt_pathcpthld) return -EINVAL; if (!sctp_is_any(sk, (const union sctp_addr *)&val->spt_address)) { trans = sctp_addr_id2transport(sk, &val->spt_address, val->spt_assoc_id); if (!trans) return -ENOENT; if (val->spt_pathmaxrxt) trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) trans->ps_retrans = val->spt_pathcpthld; trans->pf_retrans = val->spt_pathpfthld; return 0; } asoc = sctp_id2assoc(sk, val->spt_assoc_id); if (!asoc && val->spt_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { if (val->spt_pathmaxrxt) trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) trans->ps_retrans = val->spt_pathcpthld; trans->pf_retrans = val->spt_pathpfthld; } if (val->spt_pathmaxrxt) asoc->pathmaxrxt = val->spt_pathmaxrxt; if (v2) asoc->ps_retrans = val->spt_pathcpthld; asoc->pf_retrans = val->spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); if (val->spt_pathmaxrxt) sp->pathmaxrxt = val->spt_pathmaxrxt; if (v2) sp->ps_retrans = val->spt_pathcpthld; sp->pf_retrans = val->spt_pathpfthld; } return 0; } static int sctp_setsockopt_recvrcvinfo(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->recvrcvinfo = (*val == 0) ? 0 : 1; return 0; } static int sctp_setsockopt_recvnxtinfo(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->recvnxtinfo = (*val == 0) ? 0 : 1; return 0; } static int sctp_setsockopt_pr_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(*params)) return -EINVAL; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; sctp_sk(sk)->ep->prsctp_enable = !!params->assoc_value; return 0; } static int sctp_setsockopt_default_prinfo(struct sock *sk, struct sctp_default_prinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*info)) goto out; if (info->pr_policy & ~SCTP_PR_SCTP_MASK) goto out; if (info->pr_policy == SCTP_PR_SCTP_NONE) info->pr_value = 0; asoc = sctp_id2assoc(sk, info->pr_assoc_id); if (!asoc && info->pr_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { SCTP_PR_SET_POLICY(asoc->default_flags, info->pr_policy); asoc->default_timetolive = info->pr_value; goto out; } if (sctp_style(sk, TCP)) info->pr_assoc_id = SCTP_FUTURE_ASSOC; if (info->pr_assoc_id == SCTP_FUTURE_ASSOC || info->pr_assoc_id == SCTP_ALL_ASSOC) { SCTP_PR_SET_POLICY(sp->default_flags, info->pr_policy); sp->default_timetolive = info->pr_value; } if (info->pr_assoc_id == SCTP_CURRENT_ASSOC || info->pr_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { SCTP_PR_SET_POLICY(asoc->default_flags, info->pr_policy); asoc->default_timetolive = info->pr_value; } } out: return retval; } static int sctp_setsockopt_reconfig_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; sctp_sk(sk)->ep->reconf_enable = !!params->assoc_value; retval = 0; out: return retval; } static int sctp_setsockopt_enable_strreset(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; if (params->assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { asoc->strreset_enable = params->assoc_value; goto out; } if (sctp_style(sk, TCP)) params->assoc_id = SCTP_FUTURE_ASSOC; if (params->assoc_id == SCTP_FUTURE_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) ep->strreset_enable = params->assoc_value; if (params->assoc_id == SCTP_CURRENT_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &ep->asocs, asocs) asoc->strreset_enable = params->assoc_value; out: return retval; } static int sctp_setsockopt_reset_streams(struct sock *sk, struct sctp_reset_streams *params, unsigned int optlen) { struct sctp_association *asoc; if (optlen < sizeof(*params)) return -EINVAL; /* srs_number_streams is u16, so optlen can't be bigger than this. */ optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(__u16) * sizeof(*params)); if (params->srs_number_streams * sizeof(__u16) > optlen - sizeof(*params)) return -EINVAL; asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) return -EINVAL; return sctp_send_reset_streams(asoc, params); } static int sctp_setsockopt_reset_assoc(struct sock *sk, sctp_assoc_t *associd, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(*associd)) return -EINVAL; asoc = sctp_id2assoc(sk, *associd); if (!asoc) return -EINVAL; return sctp_send_reset_assoc(asoc); } static int sctp_setsockopt_add_streams(struct sock *sk, struct sctp_add_streams *params, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(*params)) return -EINVAL; asoc = sctp_id2assoc(sk, params->sas_assoc_id); if (!asoc) return -EINVAL; return sctp_send_add_streams(asoc, params); } static int sctp_setsockopt_scheduler(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int retval = 0; if (optlen < sizeof(*params)) return -EINVAL; if (params->assoc_value > SCTP_SS_MAX) return -EINVAL; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_sched_set_sched(asoc, params->assoc_value); if (sctp_style(sk, TCP)) params->assoc_id = SCTP_FUTURE_ASSOC; if (params->assoc_id == SCTP_FUTURE_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) sp->default_ss = params->assoc_value; if (params->assoc_id == SCTP_CURRENT_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { int ret = sctp_sched_set_sched(asoc, params->assoc_value); if (ret && !retval) retval = ret; } } return retval; } static int sctp_setsockopt_scheduler_value(struct sock *sk, struct sctp_stream_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen < sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_CURRENT_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) { retval = sctp_sched_set_value(asoc, params->stream_id, params->stream_value, GFP_KERNEL); goto out; } retval = 0; list_for_each_entry(asoc, &sctp_sk(sk)->ep->asocs, asocs) { int ret = sctp_sched_set_value(asoc, params->stream_id, params->stream_value, GFP_KERNEL); if (ret && !retval) /* try to return the 1st error. */ retval = ret; } out: return retval; } static int sctp_setsockopt_interleaving_supported(struct sock *sk, struct sctp_assoc_value *p, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen < sizeof(*p)) return -EINVAL; asoc = sctp_id2assoc(sk, p->assoc_id); if (!asoc && p->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (!sock_net(sk)->sctp.intl_enable || !sp->frag_interleave) { return -EPERM; } sp->ep->intl_enable = !!p->assoc_value; return 0; } static int sctp_setsockopt_reuse_port(struct sock *sk, int *val, unsigned int optlen) { if (!sctp_style(sk, TCP)) return -EOPNOTSUPP; if (sctp_sk(sk)->ep->base.bind_addr.port) return -EFAULT; if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->reuse = !!*val; return 0; } static int sctp_assoc_ulpevent_type_set(struct sctp_event *param, struct sctp_association *asoc) { struct sctp_ulpevent *event; sctp_ulpevent_type_set(&asoc->subscribe, param->se_type, param->se_on); if (param->se_type == SCTP_SENDER_DRY_EVENT && param->se_on) { if (sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; asoc->stream.si->enqueue_event(&asoc->ulpq, event); } } return 0; } static int sctp_setsockopt_event(struct sock *sk, struct sctp_event *param, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int retval = 0; if (optlen < sizeof(*param)) return -EINVAL; if (param->se_type < SCTP_SN_TYPE_BASE || param->se_type > SCTP_SN_TYPE_MAX) return -EINVAL; asoc = sctp_id2assoc(sk, param->se_assoc_id); if (!asoc && param->se_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_assoc_ulpevent_type_set(param, asoc); if (sctp_style(sk, TCP)) param->se_assoc_id = SCTP_FUTURE_ASSOC; if (param->se_assoc_id == SCTP_FUTURE_ASSOC || param->se_assoc_id == SCTP_ALL_ASSOC) sctp_ulpevent_type_set(&sp->subscribe, param->se_type, param->se_on); if (param->se_assoc_id == SCTP_CURRENT_ASSOC || param->se_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { int ret = sctp_assoc_ulpevent_type_set(param, asoc); if (ret && !retval) retval = ret; } } return retval; } static int sctp_setsockopt_asconf_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; ep->asconf_enable = !!params->assoc_value; if (ep->asconf_enable && ep->auth_enable) { sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); } retval = 0; out: return retval; } static int sctp_setsockopt_auth_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; if (params->assoc_value) { retval = sctp_auth_init(ep, GFP_KERNEL); if (retval) goto out; if (ep->asconf_enable) { sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); } } ep->auth_enable = !!params->assoc_value; retval = 0; out: return retval; } static int sctp_setsockopt_ecn_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; sctp_sk(sk)->ep->ecn_enable = !!params->assoc_value; retval = 0; out: return retval; } static int sctp_setsockopt_pf_expose(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; if (params->assoc_value > SCTP_PF_EXPOSE_MAX) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) asoc->pf_expose = params->assoc_value; else sctp_sk(sk)->pf_expose = params->assoc_value; retval = 0; out: return retval; } static int sctp_setsockopt_encap_port(struct sock *sk, struct sctp_udpencaps *encap, unsigned int optlen) { struct sctp_association *asoc; struct sctp_transport *t; __be16 encap_port; if (optlen != sizeof(*encap)) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ encap_port = (__force __be16)encap->sue_port; if (!sctp_is_any(sk, (union sctp_addr *)&encap->sue_address)) { t = sctp_addr_id2transport(sk, &encap->sue_address, encap->sue_assoc_id); if (!t) return -EINVAL; t->encap_port = encap_port; return 0; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, encap->sue_assoc_id); if (!asoc && encap->sue_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* If changes are for association, also apply encap_port to * each transport. */ if (asoc) { list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) t->encap_port = encap_port; asoc->encap_port = encap_port; return 0; } sctp_sk(sk)->encap_port = encap_port; return 0; } static int sctp_setsockopt_probe_interval(struct sock *sk, struct sctp_probeinterval *params, unsigned int optlen) { struct sctp_association *asoc; struct sctp_transport *t; __u32 probe_interval; if (optlen != sizeof(*params)) return -EINVAL; probe_interval = params->spi_interval; if (probe_interval && probe_interval < SCTP_PROBE_TIMER_MIN) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&params->spi_address)) { t = sctp_addr_id2transport(sk, &params->spi_address, params->spi_assoc_id); if (!t) return -EINVAL; t->probe_interval = msecs_to_jiffies(probe_interval); sctp_transport_pl_reset(t); return 0; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params->spi_assoc_id); if (!asoc && params->spi_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* If changes are for association, also apply probe_interval to * each transport. */ if (asoc) { list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->probe_interval = msecs_to_jiffies(probe_interval); sctp_transport_pl_reset(t); } asoc->probe_interval = msecs_to_jiffies(probe_interval); return 0; } sctp_sk(sk)->probe_interval = probe_interval; return 0; } /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve * socket options. Socket options are used to change the default * behavior of sockets calls. They are described in Section 7. * * The syntax is: * * ret = getsockopt(int sd, int level, int optname, void __user *optval, * int __user *optlen); * ret = setsockopt(int sd, int level, int optname, const void __user *optval, * int optlen); * * sd - the socket descript. * level - set to IPPROTO_SCTP for all SCTP options. * optname - the option name. * optval - the buffer to store the value of the option. * optlen - the size of the buffer. */ static int sctp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { void *kopt = NULL; int retval = 0; pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname); /* I can hardly begin to describe how wrong this is. This is * so broken as to be worse than useless. The API draft * REALLY is NOT helpful here... I am not convinced that the * semantics of setsockopt() with a level OTHER THAN SOL_SCTP * are at all well-founded. */ if (level != SOL_SCTP) { struct sctp_af *af = sctp_sk(sk)->pf->af; return af->setsockopt(sk, level, optname, optval, optlen); } if (optlen > 0) { /* Trim it to the biggest size sctp sockopt may need if necessary */ optlen = min_t(unsigned int, optlen, PAGE_ALIGN(USHRT_MAX + sizeof(__u16) * sizeof(struct sctp_reset_streams))); kopt = memdup_sockptr(optval, optlen); if (IS_ERR(kopt)) return PTR_ERR(kopt); } lock_sock(sk); switch (optname) { case SCTP_SOCKOPT_BINDX_ADD: /* 'optlen' is the size of the addresses buffer. */ retval = sctp_setsockopt_bindx(sk, kopt, optlen, SCTP_BINDX_ADD_ADDR); break; case SCTP_SOCKOPT_BINDX_REM: /* 'optlen' is the size of the addresses buffer. */ retval = sctp_setsockopt_bindx(sk, kopt, optlen, SCTP_BINDX_REM_ADDR); break; case SCTP_SOCKOPT_CONNECTX_OLD: /* 'optlen' is the size of the addresses buffer. */ retval = sctp_setsockopt_connectx_old(sk, kopt, optlen); break; case SCTP_SOCKOPT_CONNECTX: /* 'optlen' is the size of the addresses buffer. */ retval = sctp_setsockopt_connectx(sk, kopt, optlen); break; case SCTP_DISABLE_FRAGMENTS: retval = sctp_setsockopt_disable_fragments(sk, kopt, optlen); break; case SCTP_EVENTS: retval = sctp_setsockopt_events(sk, kopt, optlen); break; case SCTP_AUTOCLOSE: retval = sctp_setsockopt_autoclose(sk, kopt, optlen); break; case SCTP_PEER_ADDR_PARAMS: retval = sctp_setsockopt_peer_addr_params(sk, kopt, optlen); break; case SCTP_DELAYED_SACK: retval = sctp_setsockopt_delayed_ack(sk, kopt, optlen); break; case SCTP_PARTIAL_DELIVERY_POINT: retval = sctp_setsockopt_partial_delivery_point(sk, kopt, optlen); break; case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, kopt, optlen); break; case SCTP_DEFAULT_SEND_PARAM: retval = sctp_setsockopt_default_send_param(sk, kopt, optlen); break; case SCTP_DEFAULT_SNDINFO: retval = sctp_setsockopt_default_sndinfo(sk, kopt, optlen); break; case SCTP_PRIMARY_ADDR: retval = sctp_setsockopt_primary_addr(sk, kopt, optlen); break; case SCTP_SET_PEER_PRIMARY_ADDR: retval = sctp_setsockopt_peer_primary_addr(sk, kopt, optlen); break; case SCTP_NODELAY: retval = sctp_setsockopt_nodelay(sk, kopt, optlen); break; case SCTP_RTOINFO: retval = sctp_setsockopt_rtoinfo(sk, kopt, optlen); break; case SCTP_ASSOCINFO: retval = sctp_setsockopt_associnfo(sk, kopt, optlen); break; case SCTP_I_WANT_MAPPED_V4_ADDR: retval = sctp_setsockopt_mappedv4(sk, kopt, optlen); break; case SCTP_MAXSEG: retval = sctp_setsockopt_maxseg(sk, kopt, optlen); break; case SCTP_ADAPTATION_LAYER: retval = sctp_setsockopt_adaptation_layer(sk, kopt, optlen); break; case SCTP_CONTEXT: retval = sctp_setsockopt_context(sk, kopt, optlen); break; case SCTP_FRAGMENT_INTERLEAVE: retval = sctp_setsockopt_fragment_interleave(sk, kopt, optlen); break; case SCTP_MAX_BURST: retval = sctp_setsockopt_maxburst(sk, kopt, optlen); break; case SCTP_AUTH_CHUNK: retval = sctp_setsockopt_auth_chunk(sk, kopt, optlen); break; case SCTP_HMAC_IDENT: retval = sctp_setsockopt_hmac_ident(sk, kopt, optlen); break; case SCTP_AUTH_KEY: retval = sctp_setsockopt_auth_key(sk, kopt, optlen); break; case SCTP_AUTH_ACTIVE_KEY: retval = sctp_setsockopt_active_key(sk, kopt, optlen); break; case SCTP_AUTH_DELETE_KEY: retval = sctp_setsockopt_del_key(sk, kopt, optlen); break; case SCTP_AUTH_DEACTIVATE_KEY: retval = sctp_setsockopt_deactivate_key(sk, kopt, optlen); break; case SCTP_AUTO_ASCONF: retval = sctp_setsockopt_auto_asconf(sk, kopt, optlen); break; case SCTP_PEER_ADDR_THLDS: retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, false); break; case SCTP_PEER_ADDR_THLDS_V2: retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, true); break; case SCTP_RECVRCVINFO: retval = sctp_setsockopt_recvrcvinfo(sk, kopt, optlen); break; case SCTP_RECVNXTINFO: retval = sctp_setsockopt_recvnxtinfo(sk, kopt, optlen); break; case SCTP_PR_SUPPORTED: retval = sctp_setsockopt_pr_supported(sk, kopt, optlen); break; case SCTP_DEFAULT_PRINFO: retval = sctp_setsockopt_default_prinfo(sk, kopt, optlen); break; case SCTP_RECONFIG_SUPPORTED: retval = sctp_setsockopt_reconfig_supported(sk, kopt, optlen); break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_setsockopt_enable_strreset(sk, kopt, optlen); break; case SCTP_RESET_STREAMS: retval = sctp_setsockopt_reset_streams(sk, kopt, optlen); break; case SCTP_RESET_ASSOC: retval = sctp_setsockopt_reset_assoc(sk, kopt, optlen); break; case SCTP_ADD_STREAMS: retval = sctp_setsockopt_add_streams(sk, kopt, optlen); break; case SCTP_STREAM_SCHEDULER: retval = sctp_setsockopt_scheduler(sk, kopt, optlen); break; case SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_setsockopt_scheduler_value(sk, kopt, optlen); break; case SCTP_INTERLEAVING_SUPPORTED: retval = sctp_setsockopt_interleaving_supported(sk, kopt, optlen); break; case SCTP_REUSE_PORT: retval = sctp_setsockopt_reuse_port(sk, kopt, optlen); break; case SCTP_EVENT: retval = sctp_setsockopt_event(sk, kopt, optlen); break; case SCTP_ASCONF_SUPPORTED: retval = sctp_setsockopt_asconf_supported(sk, kopt, optlen); break; case SCTP_AUTH_SUPPORTED: retval = sctp_setsockopt_auth_supported(sk, kopt, optlen); break; case SCTP_ECN_SUPPORTED: retval = sctp_setsockopt_ecn_supported(sk, kopt, optlen); break; case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_setsockopt_pf_expose(sk, kopt, optlen); break; case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_setsockopt_encap_port(sk, kopt, optlen); break; case SCTP_PLPMTUD_PROBE_INTERVAL: retval = sctp_setsockopt_probe_interval(sk, kopt, optlen); break; default: retval = -ENOPROTOOPT; break; } release_sock(sk); kfree(kopt); return retval; } /* API 3.1.6 connect() - UDP Style Syntax * * An application may use the connect() call in the UDP model to initiate an * association without sending data. * * The syntax is: * * ret = connect(int sd, const struct sockaddr *nam, socklen_t len); * * sd: the socket descriptor to have a new association added to. * * nam: the address structure (either struct sockaddr_in or struct * sockaddr_in6 defined in RFC2553 [7]). * * len: the size of the address. */ static int sctp_connect(struct sock *sk, struct sockaddr *addr, int addr_len, int flags) { struct sctp_af *af; int err = -EINVAL; lock_sock(sk); pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); /* Validate addr_len before calling common connect/connectx routine. */ af = sctp_get_af_specific(addr->sa_family); if (af && addr_len >= af->sockaddr_len) err = __sctp_connect(sk, addr, af->sockaddr_len, flags, NULL); release_sock(sk); return err; } int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return -EOPNOTSUPP; return sctp_connect(sock->sk, uaddr, addr_len, flags); } /* FIXME: Write comments. */ static int sctp_disconnect(struct sock *sk, int flags) { return -EOPNOTSUPP; /* STUB */ } /* 4.1.4 accept() - TCP Style Syntax * * Applications use accept() call to remove an established SCTP * association from the accept queue of the endpoint. A new socket * descriptor will be returned from accept() to represent the newly * formed association. */ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) { struct sctp_sock *sp; struct sctp_endpoint *ep; struct sock *newsk = NULL; struct sctp_association *asoc; long timeo; int error = 0; lock_sock(sk); sp = sctp_sk(sk); ep = sp->ep; if (!sctp_style(sk, TCP)) { error = -EOPNOTSUPP; goto out; } if (!sctp_sstate(sk, LISTENING)) { error = -EINVAL; goto out; } timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); error = sctp_wait_for_accept(sk, timeo); if (error) goto out; /* We treat the list of associations on the endpoint as the accept * queue and pick the first association on the list. */ asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); newsk = sp->pf->create_accept_sk(sk, asoc, kern); if (!newsk) { error = -ENOMEM; goto out; } /* Populate the fields of the newsk from the oldsk and migrate the * asoc to the newsk. */ error = sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); if (error) { sk_common_release(newsk); newsk = NULL; } out: release_sock(sk); *err = error; return newsk; } /* The SCTP ioctl handler. */ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) { int rc = -ENOTCONN; lock_sock(sk); /* * SEQPACKET-style sockets in LISTENING state are valid, for * SCTP, so only discard TCP-style sockets in LISTENING state. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) goto out; switch (cmd) { case SIOCINQ: { struct sk_buff *skb; unsigned int amount = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount of this packet since * that is all that will be read. */ amount = skb->len; } rc = put_user(amount, (int __user *)arg); break; } default: rc = -ENOIOCTLCMD; break; } out: release_sock(sk); return rc; } /* This is the function which gets called during socket creation to * initialized the SCTP-specific portion of the sock. * The sock structure should already be zero-filled memory. */ static int sctp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); struct sctp_sock *sp; pr_debug("%s: sk:%p\n", __func__, sk); sp = sctp_sk(sk); /* Initialize the SCTP per socket area. */ switch (sk->sk_type) { case SOCK_SEQPACKET: sp->type = SCTP_SOCKET_UDP; break; case SOCK_STREAM: sp->type = SCTP_SOCKET_TCP; break; default: return -ESOCKTNOSUPPORT; } sk->sk_gso_type = SKB_GSO_SCTP; /* Initialize default send parameters. These parameters can be * modified with the SCTP_DEFAULT_SEND_PARAM socket option. */ sp->default_stream = 0; sp->default_ppid = 0; sp->default_flags = 0; sp->default_context = 0; sp->default_timetolive = 0; sp->default_rcv_context = 0; sp->max_burst = net->sctp.max_burst; sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg; /* Initialize default setup parameters. These parameters * can be modified with the SCTP_INITMSG socket option or * overridden by the SCTP_INIT CMSG. */ sp->initmsg.sinit_num_ostreams = sctp_max_outstreams; sp->initmsg.sinit_max_instreams = sctp_max_instreams; sp->initmsg.sinit_max_attempts = net->sctp.max_retrans_init; sp->initmsg.sinit_max_init_timeo = net->sctp.rto_max; /* Initialize default RTO related parameters. These parameters can * be modified for with the SCTP_RTOINFO socket option. */ sp->rtoinfo.srto_initial = net->sctp.rto_initial; sp->rtoinfo.srto_max = net->sctp.rto_max; sp->rtoinfo.srto_min = net->sctp.rto_min; /* Initialize default association related parameters. These parameters * can be modified with the SCTP_ASSOCINFO socket option. */ sp->assocparams.sasoc_asocmaxrxt = net->sctp.max_retrans_association; sp->assocparams.sasoc_number_peer_destinations = 0; sp->assocparams.sasoc_peer_rwnd = 0; sp->assocparams.sasoc_local_rwnd = 0; sp->assocparams.sasoc_cookie_life = net->sctp.valid_cookie_life; /* Initialize default event subscriptions. By default, all the * options are off. */ sp->subscribe = 0; /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS */ sp->hbinterval = net->sctp.hb_interval; sp->udp_port = htons(net->sctp.udp_port); sp->encap_port = htons(net->sctp.encap_port); sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; sp->ps_retrans = net->sctp.ps_retrans; sp->pf_expose = net->sctp.pf_expose; sp->pathmtu = 0; /* allow default discovery */ sp->sackdelay = net->sctp.sack_timeout; sp->sackfreq = 2; sp->param_flags = SPP_HB_ENABLE | SPP_PMTUD_ENABLE | SPP_SACKDELAY_ENABLE; sp->default_ss = SCTP_SS_DEFAULT; /* If enabled no SCTP message fragmentation will be performed. * Configure through SCTP_DISABLE_FRAGMENTS socket option. */ sp->disable_fragments = 0; /* Enable Nagle algorithm by default. */ sp->nodelay = 0; sp->recvrcvinfo = 0; sp->recvnxtinfo = 0; /* Enable by default. */ sp->v4mapped = 1; /* Auto-close idle associations after the configured * number of seconds. A value of 0 disables this * feature. Configure through the SCTP_AUTOCLOSE socket option, * for UDP-style sockets only. */ sp->autoclose = 0; /* User specified fragmentation limit. */ sp->user_frag = 0; sp->adaptation_ind = 0; sp->pf = sctp_get_pf_specific(sk->sk_family); /* Control variables for partial data delivery. */ atomic_set(&sp->pd_mode, 0); skb_queue_head_init(&sp->pd_lobby); sp->frag_interleave = 0; sp->probe_interval = net->sctp.probe_interval; /* Create a per socket endpoint structure. Even if we * change the data structure relationships, this may still * be useful for storing pre-connect address information. */ sp->ep = sctp_endpoint_new(sk, GFP_KERNEL); if (!sp->ep) return -ENOMEM; sp->hmac = NULL; sk->sk_destruct = sctp_destruct_sock; SCTP_DBG_OBJCNT_INC(sock); sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); return 0; } /* Cleanup any SCTP per socket resources. Must be called with * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true */ static void sctp_destroy_sock(struct sock *sk) { struct sctp_sock *sp; pr_debug("%s: sk:%p\n", __func__, sk); /* Release our hold on the endpoint. */ sp = sctp_sk(sk); /* This could happen during socket init, thus we bail out * early, since the rest of the below is not setup either. */ if (sp->ep == NULL) return; if (sp->do_auto_asconf) { sp->do_auto_asconf = 0; list_del(&sp->auto_asconf_list); } sctp_endpoint_free(sp->ep); sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } /* Triggered when there are no references on the socket anymore */ static void sctp_destruct_common(struct sock *sk) { struct sctp_sock *sp = sctp_sk(sk); /* Free up the HMAC transform. */ crypto_free_shash(sp->hmac); } static void sctp_destruct_sock(struct sock *sk) { sctp_destruct_common(sk); inet_sock_destruct(sk); } /* API 4.1.7 shutdown() - TCP Style Syntax * int shutdown(int socket, int how); * * sd - the socket descriptor of the association to be closed. * how - Specifies the type of shutdown. The values are * as follows: * SHUT_RD * Disables further receive operations. No SCTP * protocol action is taken. * SHUT_WR * Disables further send operations, and initiates * the SCTP shutdown sequence. * SHUT_RDWR * Disables further send and receive operations * and initiates the SCTP shutdown sequence. */ static void sctp_shutdown(struct sock *sk, int how) { struct net *net = sock_net(sk); struct sctp_endpoint *ep; if (!sctp_style(sk, TCP)) return; ep = sctp_sk(sk)->ep; if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) { struct sctp_association *asoc; inet_sk_set_state(sk, SCTP_SS_CLOSING); asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); sctp_primitive_SHUTDOWN(net, asoc, NULL); } } int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info) { struct sctp_transport *prim; struct list_head *pos; int mask; memset(info, 0, sizeof(*info)); if (!asoc) { struct sctp_sock *sp = sctp_sk(sk); info->sctpi_s_autoclose = sp->autoclose; info->sctpi_s_adaptation_ind = sp->adaptation_ind; info->sctpi_s_pd_point = sp->pd_point; info->sctpi_s_nodelay = sp->nodelay; info->sctpi_s_disable_fragments = sp->disable_fragments; info->sctpi_s_v4mapped = sp->v4mapped; info->sctpi_s_frag_interleave = sp->frag_interleave; info->sctpi_s_type = sp->type; return 0; } info->sctpi_tag = asoc->c.my_vtag; info->sctpi_state = asoc->state; info->sctpi_rwnd = asoc->a_rwnd; info->sctpi_unackdata = asoc->unack_data; info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); info->sctpi_instrms = asoc->stream.incnt; info->sctpi_outstrms = asoc->stream.outcnt; list_for_each(pos, &asoc->base.inqueue.in_chunk_list) info->sctpi_inqueue++; list_for_each(pos, &asoc->outqueue.out_chunk_list) info->sctpi_outqueue++; info->sctpi_overall_error = asoc->overall_error_count; info->sctpi_max_burst = asoc->max_burst; info->sctpi_maxseg = asoc->frag_point; info->sctpi_peer_rwnd = asoc->peer.rwnd; info->sctpi_peer_tag = asoc->c.peer_vtag; mask = asoc->peer.ecn_capable << 1; mask = (mask | asoc->peer.ipv4_address) << 1; mask = (mask | asoc->peer.ipv6_address) << 1; mask = (mask | asoc->peer.hostname_address) << 1; mask = (mask | asoc->peer.asconf_capable) << 1; mask = (mask | asoc->peer.prsctp_capable) << 1; mask = (mask | asoc->peer.auth_capable); info->sctpi_peer_capable = mask; mask = asoc->peer.sack_needed << 1; mask = (mask | asoc->peer.sack_generation) << 1; mask = (mask | asoc->peer.zero_window_announced); info->sctpi_peer_sack = mask; info->sctpi_isacks = asoc->stats.isacks; info->sctpi_osacks = asoc->stats.osacks; info->sctpi_opackets = asoc->stats.opackets; info->sctpi_ipackets = asoc->stats.ipackets; info->sctpi_rtxchunks = asoc->stats.rtxchunks; info->sctpi_outofseqtsns = asoc->stats.outofseqtsns; info->sctpi_idupchunks = asoc->stats.idupchunks; info->sctpi_gapcnt = asoc->stats.gapcnt; info->sctpi_ouodchunks = asoc->stats.ouodchunks; info->sctpi_iuodchunks = asoc->stats.iuodchunks; info->sctpi_oodchunks = asoc->stats.oodchunks; info->sctpi_iodchunks = asoc->stats.iodchunks; info->sctpi_octrlchunks = asoc->stats.octrlchunks; info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; prim = asoc->peer.primary_path; memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); info->sctpi_p_state = prim->state; info->sctpi_p_cwnd = prim->cwnd; info->sctpi_p_srtt = prim->srtt; info->sctpi_p_rto = jiffies_to_msecs(prim->rto); info->sctpi_p_hbinterval = prim->hbinterval; info->sctpi_p_pathmaxrxt = prim->pathmaxrxt; info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay); info->sctpi_p_ssthresh = prim->ssthresh; info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked; info->sctpi_p_flight_size = prim->flight_size; info->sctpi_p_error = prim->error_count; return 0; } EXPORT_SYMBOL_GPL(sctp_get_sctp_info); /* use callback to avoid exporting the core structure */ void sctp_transport_walk_start(struct rhashtable_iter *iter) __acquires(RCU) { rhltable_walk_enter(&sctp_transport_hashtable, iter); rhashtable_walk_start(iter); } void sctp_transport_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { rhashtable_walk_stop(iter); rhashtable_walk_exit(iter); } struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter) { struct sctp_transport *t; t = rhashtable_walk_next(iter); for (; t; t = rhashtable_walk_next(iter)) { if (IS_ERR(t)) { if (PTR_ERR(t) == -EAGAIN) continue; break; } if (!sctp_transport_hold(t)) continue; if (net_eq(t->asoc->base.net, net) && t->asoc->peer.primary_path == t) break; sctp_transport_put(t); } return t; } struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos) { struct sctp_transport *t; if (!pos) return SEQ_START_TOKEN; while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) { if (!--pos) break; sctp_transport_put(t); } return t; } int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p) { int err = 0; int hash = 0; struct sctp_endpoint *ep; struct sctp_hashbucket *head; for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize; hash++, head++) { read_lock_bh(&head->lock); sctp_for_each_hentry(ep, &head->chain) { err = cb(ep, p); if (err) break; } read_unlock_bh(&head->lock); } return err; } EXPORT_SYMBOL_GPL(sctp_for_each_endpoint); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, void *p) { struct sctp_transport *transport; struct sctp_endpoint *ep; int err = -ENOENT; rcu_read_lock(); transport = sctp_addrs_lookup_transport(net, laddr, paddr); if (!transport) { rcu_read_unlock(); return err; } ep = transport->asoc->ep; if (!sctp_endpoint_hold(ep)) { /* asoc can be peeled off */ sctp_transport_put(transport); rcu_read_unlock(); return err; } rcu_read_unlock(); err = cb(ep, transport, p); sctp_endpoint_put(ep); sctp_transport_put(transport); return err; } EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p) { struct rhashtable_iter hti; struct sctp_transport *tsp; struct sctp_endpoint *ep; int ret; again: ret = 0; sctp_transport_walk_start(&hti); tsp = sctp_transport_get_idx(net, &hti, *pos + 1); for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { ep = tsp->asoc->ep; if (sctp_endpoint_hold(ep)) { /* asoc can be peeled off */ ret = cb(ep, tsp, p); if (ret) break; sctp_endpoint_put(ep); } (*pos)++; sctp_transport_put(tsp); } sctp_transport_walk_stop(&hti); if (ret) { if (cb_done && !cb_done(ep, tsp, p)) { (*pos)++; sctp_endpoint_put(ep); sctp_transport_put(tsp); goto again; } sctp_endpoint_put(ep); sctp_transport_put(tsp); } return ret; } EXPORT_SYMBOL_GPL(sctp_transport_traverse_process); /* 7.2.1 Association Status (SCTP_STATUS) * Applications can retrieve current status information about an * association, including association state, peer receiver window size, * number of unacked data chunks, and number of data chunks pending * receipt. This information is read-only. */ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_status status; struct sctp_association *asoc = NULL; struct sctp_transport *transport; sctp_assoc_t associd; int retval = 0; if (len < sizeof(status)) { retval = -EINVAL; goto out; } len = sizeof(status); if (copy_from_user(&status, optval, len)) { retval = -EFAULT; goto out; } associd = status.sstat_assoc_id; asoc = sctp_id2assoc(sk, associd); if (!asoc) { retval = -EINVAL; goto out; } transport = asoc->peer.primary_path; status.sstat_assoc_id = sctp_assoc2id(asoc); status.sstat_state = sctp_assoc_to_state(asoc); status.sstat_rwnd = asoc->peer.rwnd; status.sstat_unackdata = asoc->unack_data; status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); status.sstat_instrms = asoc->stream.incnt; status.sstat_outstrms = asoc->stream.outcnt; status.sstat_fragmentation_point = asoc->frag_point; status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc); memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr, transport->af_specific->sockaddr_len); /* Map ipv4 address into v4-mapped-on-v6 address. */ sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk), (union sctp_addr *)&status.sstat_primary.spinfo_address); status.sstat_primary.spinfo_state = transport->state; status.sstat_primary.spinfo_cwnd = transport->cwnd; status.sstat_primary.spinfo_srtt = transport->srtt; status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); status.sstat_primary.spinfo_mtu = transport->pathmtu; if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) status.sstat_primary.spinfo_state = SCTP_ACTIVE; if (put_user(len, optlen)) { retval = -EFAULT; goto out; } pr_debug("%s: len:%d, state:%d, rwnd:%d, assoc_id:%d\n", __func__, len, status.sstat_state, status.sstat_rwnd, status.sstat_assoc_id); if (copy_to_user(optval, &status, len)) { retval = -EFAULT; goto out; } out: return retval; } /* 7.2.2 Peer Address Information (SCTP_GET_PEER_ADDR_INFO) * * Applications can retrieve information about a specific peer address * of an association, including its reachability state, congestion * window, and retransmission timer values. This information is * read-only. */ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_paddrinfo pinfo; struct sctp_transport *transport; int retval = 0; if (len < sizeof(pinfo)) { retval = -EINVAL; goto out; } len = sizeof(pinfo); if (copy_from_user(&pinfo, optval, len)) { retval = -EFAULT; goto out; } transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, pinfo.spinfo_assoc_id); if (!transport) { retval = -EINVAL; goto out; } if (transport->state == SCTP_PF && transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) { retval = -EACCES; goto out; } pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); pinfo.spinfo_state = transport->state; pinfo.spinfo_cwnd = transport->cwnd; pinfo.spinfo_srtt = transport->srtt; pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); pinfo.spinfo_mtu = transport->pathmtu; if (pinfo.spinfo_state == SCTP_UNKNOWN) pinfo.spinfo_state = SCTP_ACTIVE; if (put_user(len, optlen)) { retval = -EFAULT; goto out; } if (copy_to_user(optval, &pinfo, len)) { retval = -EFAULT; goto out; } out: return retval; } /* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) * * This option is a on/off flag. If enabled no SCTP message * fragmentation will be performed. Instead if a message being sent * exceeds the current PMTU size, the message will NOT be sent and * instead a error will be indicated to the user. */ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = (sctp_sk(sk)->disable_fragments == 1); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* 7.1.15 Set notification and ancillary events (SCTP_EVENTS) * * This socket option is used to specify various notifications and * ancillary data the user wishes to receive. */ static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_event_subscribe subscribe; __u8 *sn_type = (__u8 *)&subscribe; int i; if (len == 0) return -EINVAL; if (len > sizeof(struct sctp_event_subscribe)) len = sizeof(struct sctp_event_subscribe); if (put_user(len, optlen)) return -EFAULT; for (i = 0; i < len; i++) sn_type[i] = sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe, SCTP_SN_TYPE_BASE + i); if (copy_to_user(optval, &subscribe, len)) return -EFAULT; return 0; } /* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) * * This socket option is applicable to the UDP-style socket only. When * set it will cause associations that are idle for more than the * specified number of seconds to automatically close. An association * being idle is defined an association that has NOT sent or received * user data. The special value of '0' indicates that no automatic * close of any associations should be performed. The option expects an * integer defining the number of seconds of idle time before an * association is closed. */ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optval, int __user *optlen) { /* Applicable to UDP-style socket only */ if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (put_user(len, optlen)) return -EFAULT; if (put_user(sctp_sk(sk)->autoclose, (int __user *)optval)) return -EFAULT; return 0; } /* Helper routine to branch off an association to a new socket. */ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); struct sctp_sock *sp = sctp_sk(sk); struct socket *sock; int err = 0; /* Do not peel off from one netns to another one. */ if (!net_eq(current->nsproxy->net_ns, sock_net(sk))) return -EINVAL; if (!asoc) return -EINVAL; /* An association cannot be branched off from an already peeled-off * socket, nor is this supported for tcp style sockets. */ if (!sctp_style(sk, UDP)) return -EINVAL; /* Create a new socket. */ err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); if (err < 0) return err; sctp_copy_sock(sock->sk, sk, asoc); /* Make peeled-off sockets more like 1-1 accepted sockets. * Set the daddr and initialize id to something more random and also * copy over any ip options. */ sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sock->sk); sp->pf->copy_ip_options(sk, sock->sk); /* Populate the fields of the newsk from the oldsk and migrate the * asoc to the newsk. */ err = sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); if (err) { sock_release(sock); sock = NULL; } *sockp = sock; return err; } EXPORT_SYMBOL(sctp_do_peeloff); static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *peeloff, struct file **newfile, unsigned flags) { struct socket *newsock; int retval; retval = sctp_do_peeloff(sk, peeloff->associd, &newsock); if (retval < 0) goto out; /* Map the socket to an unused fd that can be returned to the user. */ retval = get_unused_fd_flags(flags & SOCK_CLOEXEC); if (retval < 0) { sock_release(newsock); goto out; } *newfile = sock_alloc_file(newsock, 0, NULL); if (IS_ERR(*newfile)) { put_unused_fd(retval); retval = PTR_ERR(*newfile); *newfile = NULL; return retval; } pr_debug("%s: sk:%p, newsk:%p, sd:%d\n", __func__, sk, newsock->sk, retval); peeloff->sd = retval; if (flags & SOCK_NONBLOCK) (*newfile)->f_flags |= O_NONBLOCK; out: return retval; } static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval, int __user *optlen) { sctp_peeloff_arg_t peeloff; struct file *newfile = NULL; int retval = 0; if (len < sizeof(sctp_peeloff_arg_t)) return -EINVAL; len = sizeof(sctp_peeloff_arg_t); if (copy_from_user(&peeloff, optval, len)) return -EFAULT; retval = sctp_getsockopt_peeloff_common(sk, &peeloff, &newfile, 0); if (retval < 0) goto out; /* Return the fd mapped to the new socket. */ if (put_user(len, optlen)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } if (copy_to_user(optval, &peeloff, len)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } fd_install(retval, newfile); out: return retval; } static int sctp_getsockopt_peeloff_flags(struct sock *sk, int len, char __user *optval, int __user *optlen) { sctp_peeloff_flags_arg_t peeloff; struct file *newfile = NULL; int retval = 0; if (len < sizeof(sctp_peeloff_flags_arg_t)) return -EINVAL; len = sizeof(sctp_peeloff_flags_arg_t); if (copy_from_user(&peeloff, optval, len)) return -EFAULT; retval = sctp_getsockopt_peeloff_common(sk, &peeloff.p_arg, &newfile, peeloff.flags); if (retval < 0) goto out; /* Return the fd mapped to the new socket. */ if (put_user(len, optlen)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } if (copy_to_user(optval, &peeloff, len)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } fd_install(retval, newfile); out: return retval; } /* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) * * Applications can enable or disable heartbeats for any peer address of * an association, modify an address's heartbeat interval, force a * heartbeat to be sent immediately, and adjust the address's maximum * number of retransmissions sent before an address is considered * unreachable. The following structure is used to access and modify an * address's parameters: * * struct sctp_paddrparams { * sctp_assoc_t spp_assoc_id; * struct sockaddr_storage spp_address; * uint32_t spp_hbinterval; * uint16_t spp_pathmaxrxt; * uint32_t spp_pathmtu; * uint32_t spp_sackdelay; * uint32_t spp_flags; * }; * * spp_assoc_id - (one-to-many style socket) This is filled in the * application, and identifies the association for * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, * in milliseconds. If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be * considered unreachable. If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmtu - When Path MTU discovery is disabled the value * specified here will be the "fixed" path mtu. * Note that if the spp_address field is empty * then all associations on this address will * have this fixed path mtu set upon them. * * spp_sackdelay - When delayed sack is enabled, this value specifies * the number of milliseconds that sacks will be delayed * for. This value will apply to all addresses of an * association if the spp_address field is empty. Note * also, that if delayed sack is enabled and this * value is set to 0, no change is made to the last * recorded delayed sack timer value. * * spp_flags - These flags are used to control various features * on an association. The flag field may contain * zero or more of the following options. * * SPP_HB_ENABLE - Enable heartbeats on the * specified address. Note that if the address * field is empty all addresses for the association * have heartbeats enabled upon them. * * SPP_HB_DISABLE - Disable heartbeats on the * speicifed address. Note that if the address * field is empty all addresses for the association * will have their heartbeats disabled. Note also * that SPP_HB_ENABLE and SPP_HB_DISABLE are * mutually exclusive, only one of these two should * be specified. Enabling both fields will have * undetermined results. * * SPP_HB_DEMAND - Request a user initiated heartbeat * to be made immediately. * * SPP_PMTUD_ENABLE - This field will enable PMTU * discovery upon the specified address. Note that * if the address feild is empty then all addresses * on the association are effected. * * SPP_PMTUD_DISABLE - This field will disable PMTU * discovery upon the specified address. Note that * if the address feild is empty then all addresses * on the association are effected. Not also that * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually * exclusive. Enabling both will have undetermined * results. * * SPP_SACKDELAY_ENABLE - Setting this flag turns * on delayed sack. The time specified in spp_sackdelay * is used to specify the sack delay for this address. Note * that if spp_address is empty then all addresses will * enable delayed sack and take on the sack delay * value specified in spp_sackdelay. * SPP_SACKDELAY_DISABLE - Setting this flag turns * off delayed sack. If the spp_address field is blank then * delayed sack is disabled for the entire association. Note * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. * * SPP_IPV6_FLOWLABEL: Setting this flag enables the * setting of the IPV6 flow label value. The value is * contained in the spp_ipv6_flowlabel field. * Upon retrieval, this flag will be set to indicate that * the spp_ipv6_flowlabel field has a valid value returned. * If a specific destination address is set (in the * spp_address field), then the value returned is that of * the address. If just an association is specified (and * no address), then the association's default flow label * is returned. If neither an association nor a destination * is specified, then the socket's default flow label is * returned. For non-IPv6 sockets, this flag will be left * cleared. * * SPP_DSCP: Setting this flag enables the setting of the * Differentiated Services Code Point (DSCP) value * associated with either the association or a specific * address. The value is obtained in the spp_dscp field. * Upon retrieval, this flag will be set to indicate that * the spp_dscp field has a valid value returned. If a * specific destination address is set when called (in the * spp_address field), then that specific destination * address's DSCP value is returned. If just an association * is specified, then the association's default DSCP is * returned. If neither an association nor a destination is * specified, then the socket's default DSCP is returned. * * spp_ipv6_flowlabel * - This field is used in conjunction with the * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. * The 20 least significant bits are used for the flow * label. This setting has precedence over any IPv6-layer * setting. * * spp_dscp - This field is used in conjunction with the SPP_DSCP flag * and contains the DSCP. The 6 most significant bits are * used for the DSCP. This setting has precedence over any * IPv4- or IPv6- layer setting. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_paddrparams params; struct sctp_transport *trans = NULL; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); if (len >= sizeof(params)) len = sizeof(params); else if (len >= ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4)) len = ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4); else return -EINVAL; if (copy_from_user(&params, optval, len)) return -EFAULT; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&params.spp_address)) { trans = sctp_addr_id2transport(sk, &params.spp_address, params.spp_assoc_id); if (!trans) { pr_debug("%s: failed no transport\n", __func__); return -EINVAL; } } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params.spp_assoc_id); if (!asoc && params.spp_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { pr_debug("%s: failed no association\n", __func__); return -EINVAL; } if (trans) { /* Fetch transport values. */ params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); params.spp_pathmtu = trans->pathmtu; params.spp_pathmaxrxt = trans->pathmaxrxt; params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = trans->param_flags; if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) { params.spp_ipv6_flowlabel = trans->flowlabel & SCTP_FLOWLABEL_VAL_MASK; params.spp_flags |= SPP_IPV6_FLOWLABEL; } if (trans->dscp & SCTP_DSCP_SET_MASK) { params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK; params.spp_flags |= SPP_DSCP; } } else if (asoc) { /* Fetch association values. */ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); params.spp_pathmtu = asoc->pathmtu; params.spp_pathmaxrxt = asoc->pathmaxrxt; params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = asoc->param_flags; if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) { params.spp_ipv6_flowlabel = asoc->flowlabel & SCTP_FLOWLABEL_VAL_MASK; params.spp_flags |= SPP_IPV6_FLOWLABEL; } if (asoc->dscp & SCTP_DSCP_SET_MASK) { params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK; params.spp_flags |= SPP_DSCP; } } else { /* Fetch socket values. */ params.spp_hbinterval = sp->hbinterval; params.spp_pathmtu = sp->pathmtu; params.spp_sackdelay = sp->sackdelay; params.spp_pathmaxrxt = sp->pathmaxrxt; /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = sp->param_flags; if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) { params.spp_ipv6_flowlabel = sp->flowlabel & SCTP_FLOWLABEL_VAL_MASK; params.spp_flags |= SPP_IPV6_FLOWLABEL; } if (sp->dscp & SCTP_DSCP_SET_MASK) { params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK; params.spp_flags |= SPP_DSCP; } } if (copy_to_user(optval, &params, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } /* * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK) * * This option will effect the way delayed acks are performed. This * option allows you to get or set the delayed ack time, in * milliseconds. It also allows changing the delayed ack frequency. * Changing the frequency to 1 disables the delayed sack algorithm. If * the assoc_id is 0, then this sets or gets the endpoints default * values. If the assoc_id field is non-zero, then the set or get * effects the specified association for the one to many model (the * assoc_id field is ignored by the one to one model). Note that if * sack_delay or sack_freq are 0 when setting this option, then the * current values will remain unchanged. * * struct sctp_sack_info { * sctp_assoc_t sack_assoc_id; * uint32_t sack_delay; * uint32_t sack_freq; * }; * * sack_assoc_id - This parameter, indicates which association the user * is performing an action upon. Note that if this field's value is * zero then the endpoints default value is changed (effecting future * associations only). * * sack_delay - This parameter contains the number of milliseconds that * the user is requesting the delayed ACK timer be set to. Note that * this value is defined in the standard to be between 200 and 500 * milliseconds. * * sack_freq - This parameter contains the number of packets that must * be received before a sack is sent without waiting for the delay * timer to expire. The default value for this is 2, setting this * value to 1 will disable the delayed sack algorithm. */ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sack_info params; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); if (len >= sizeof(struct sctp_sack_info)) { len = sizeof(struct sctp_sack_info); if (copy_from_user(&params, optval, len)) return -EFAULT; } else if (len == sizeof(struct sctp_assoc_value)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of struct sctp_assoc_value in delayed_ack socket option.\n" "Use struct sctp_sack_info instead\n", current->comm, task_pid_nr(current)); if (copy_from_user(&params, optval, len)) return -EFAULT; } else return -EINVAL; /* Get association, if sack_assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params.sack_assoc_id); if (!asoc && params.sack_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { /* Fetch association values. */ if (asoc->param_flags & SPP_SACKDELAY_ENABLE) { params.sack_delay = jiffies_to_msecs(asoc->sackdelay); params.sack_freq = asoc->sackfreq; } else { params.sack_delay = 0; params.sack_freq = 1; } } else { /* Fetch socket values. */ if (sp->param_flags & SPP_SACKDELAY_ENABLE) { params.sack_delay = sp->sackdelay; params.sack_freq = sp->sackfreq; } else { params.sack_delay = 0; params.sack_freq = 1; } } if (copy_to_user(optval, &params, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association * initialization. The option name argument to setsockopt() and getsockopt() * is SCTP_INITMSG. * * Setting initialization parameters is effective only on an unconnected * socket (for UDP-style sockets only future associations are effected * by the change). With TCP-style sockets, this option is inherited by * sockets derived from a listener socket. */ static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval, int __user *optlen) { if (len < sizeof(struct sctp_initmsg)) return -EINVAL; len = sizeof(struct sctp_initmsg); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &sctp_sk(sk)->initmsg, len)) return -EFAULT; return 0; } static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_association *asoc; int cnt = 0; struct sctp_getaddrs getaddrs; struct sctp_transport *from; void __user *to; union sctp_addr temp; struct sctp_sock *sp = sctp_sk(sk); int addrlen; size_t space_left; int bytes_copied; if (len < sizeof(struct sctp_getaddrs)) return -EINVAL; if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) return -EFAULT; /* For UDP-style sockets, id specifies the association to query. */ asoc = sctp_id2assoc(sk, getaddrs.assoc_id); if (!asoc) return -EINVAL; to = optval + offsetof(struct sctp_getaddrs, addrs); space_left = len - offsetof(struct sctp_getaddrs, addrs); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { memcpy(&temp, &from->ipaddr, sizeof(temp)); addrlen = sctp_get_pf_specific(sk->sk_family) ->addr_to_user(sp, &temp); if (space_left < addrlen) return -ENOMEM; if (copy_to_user(to, &temp, addrlen)) return -EFAULT; to += addrlen; cnt++; space_left -= addrlen; } if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) return -EFAULT; bytes_copied = ((char __user *)to) - optval; if (put_user(bytes_copied, optlen)) return -EFAULT; return 0; } static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, size_t space_left, int *bytes_copied) { struct sctp_sockaddr_entry *addr; union sctp_addr temp; int cnt = 0; int addrlen; struct net *net = sock_net(sk); rcu_read_lock(); list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) { if (!addr->valid) continue; if ((PF_INET == sk->sk_family) && (AF_INET6 == addr->a.sa.sa_family)) continue; if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) && (AF_INET == addr->a.sa.sa_family)) continue; memcpy(&temp, &addr->a, sizeof(temp)); if (!temp.v4.sin_port) temp.v4.sin_port = htons(port); addrlen = sctp_get_pf_specific(sk->sk_family) ->addr_to_user(sctp_sk(sk), &temp); if (space_left < addrlen) { cnt = -ENOMEM; break; } memcpy(to, &temp, addrlen); to += addrlen; cnt++; space_left -= addrlen; *bytes_copied += addrlen; } rcu_read_unlock(); return cnt; } static int sctp_getsockopt_local_addrs(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_bind_addr *bp; struct sctp_association *asoc; int cnt = 0; struct sctp_getaddrs getaddrs; struct sctp_sockaddr_entry *addr; void __user *to; union sctp_addr temp; struct sctp_sock *sp = sctp_sk(sk); int addrlen; int err = 0; size_t space_left; int bytes_copied = 0; void *addrs; void *buf; if (len < sizeof(struct sctp_getaddrs)) return -EINVAL; if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) return -EFAULT; /* * For UDP-style sockets, id specifies the association to query. * If the id field is set to the value '0' then the locally bound * addresses are returned without regard to any particular * association. */ if (0 == getaddrs.assoc_id) { bp = &sctp_sk(sk)->ep->base.bind_addr; } else { asoc = sctp_id2assoc(sk, getaddrs.assoc_id); if (!asoc) return -EINVAL; bp = &asoc->base.bind_addr; } to = optval + offsetof(struct sctp_getaddrs, addrs); space_left = len - offsetof(struct sctp_getaddrs, addrs); addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN); if (!addrs) return -ENOMEM; /* If the endpoint is bound to 0.0.0.0 or ::0, get the valid * addresses from the global local address list. */ if (sctp_list_single_entry(&bp->address_list)) { addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(sk, &addr->a)) { cnt = sctp_copy_laddrs(sk, bp->port, addrs, space_left, &bytes_copied); if (cnt < 0) { err = cnt; goto out; } goto copy_getaddrs; } } buf = addrs; /* Protection on the bound address list is not needed since * in the socket option context we hold a socket lock and * thus the bound address list can't change. */ list_for_each_entry(addr, &bp->address_list, list) { memcpy(&temp, &addr->a, sizeof(temp)); addrlen = sctp_get_pf_specific(sk->sk_family) ->addr_to_user(sp, &temp); if (space_left < addrlen) { err = -ENOMEM; /*fixme: right error?*/ goto out; } memcpy(buf, &temp, addrlen); buf += addrlen; bytes_copied += addrlen; cnt++; space_left -= addrlen; } copy_getaddrs: if (copy_to_user(to, addrs, bytes_copied)) { err = -EFAULT; goto out; } if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) { err = -EFAULT; goto out; } /* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too, * but we can't change it anymore. */ if (put_user(bytes_copied, optlen)) err = -EFAULT; out: kfree(addrs); return err; } /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) * * Requests that the local SCTP stack use the enclosed peer address as * the association primary. The enclosed address must be one of the * association peer's addresses. */ static int sctp_getsockopt_primary_addr(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_prim prim; struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); if (len < sizeof(struct sctp_prim)) return -EINVAL; len = sizeof(struct sctp_prim); if (copy_from_user(&prim, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, prim.ssp_assoc_id); if (!asoc) return -EINVAL; if (!asoc->peer.primary_path) return -ENOTCONN; memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr, asoc->peer.primary_path->af_specific->sockaddr_len); sctp_get_pf_specific(sk->sk_family)->addr_to_user(sp, (union sctp_addr *)&prim.ssp_addr); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &prim, len)) return -EFAULT; return 0; } /* * 7.1.11 Set Adaptation Layer Indicator (SCTP_ADAPTATION_LAYER) * * Requests that the local endpoint set the specified Adaptation Layer * Indication parameter for all future INIT and INIT-ACK exchanges. */ static int sctp_getsockopt_adaptation_layer(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_setadaptation adaptation; if (len < sizeof(struct sctp_setadaptation)) return -EINVAL; len = sizeof(struct sctp_setadaptation); adaptation.ssb_adaptation_ind = sctp_sk(sk)->adaptation_ind; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &adaptation, len)) return -EFAULT; return 0; } /* * * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) * * Applications that wish to use the sendto() system call may wish to * specify a default set of parameters that would normally be supplied * through the inclusion of ancillary data. This socket option allows * such an application to set the default sctp_sndrcvinfo structure. * The application that wishes to use this socket option simply passes * in to this call the sctp_sndrcvinfo structure defined in Section * 5.2.2) The input parameters accepted by this call include * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, * sinfo_timetolive. The user must provide the sinfo_assoc_id field in * to this call if the caller is using the UDP model. * * For getsockopt, it get the default sctp_sndrcvinfo structure. */ static int sctp_getsockopt_default_send_param(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_sndrcvinfo info; if (len < sizeof(info)) return -EINVAL; len = sizeof(info); if (copy_from_user(&info, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { info.sinfo_stream = asoc->default_stream; info.sinfo_flags = asoc->default_flags; info.sinfo_ppid = asoc->default_ppid; info.sinfo_context = asoc->default_context; info.sinfo_timetolive = asoc->default_timetolive; } else { info.sinfo_stream = sp->default_stream; info.sinfo_flags = sp->default_flags; info.sinfo_ppid = sp->default_ppid; info.sinfo_context = sp->default_context; info.sinfo_timetolive = sp->default_timetolive; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &info, len)) return -EFAULT; return 0; } /* RFC6458, Section 8.1.31. Set/get Default Send Parameters * (SCTP_DEFAULT_SNDINFO) */ static int sctp_getsockopt_default_sndinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_sndinfo info; if (len < sizeof(info)) return -EINVAL; len = sizeof(info); if (copy_from_user(&info, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, info.snd_assoc_id); if (!asoc && info.snd_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { info.snd_sid = asoc->default_stream; info.snd_flags = asoc->default_flags; info.snd_ppid = asoc->default_ppid; info.snd_context = asoc->default_context; } else { info.snd_sid = sp->default_stream; info.snd_flags = sp->default_flags; info.snd_ppid = sp->default_ppid; info.snd_context = sp->default_context; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &info, len)) return -EFAULT; return 0; } /* * * 7.1.5 SCTP_NODELAY * * Turn on/off any Nagle-like algorithm. This means that packets are * generally sent as soon as possible and no unnecessary delays are * introduced, at the cost of more packets in the network. Expects an * integer boolean flag. */ static int sctp_getsockopt_nodelay(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = (sctp_sk(sk)->nodelay == 1); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * * 7.1.1 SCTP_RTOINFO * * The protocol parameters used to initialize and bound retransmission * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access * and modify these parameters. * All parameters are time values, in milliseconds. A value of 0, when * modifying the parameters, indicates that the current value should not * be changed. * */ static int sctp_getsockopt_rtoinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_rtoinfo rtoinfo; struct sctp_association *asoc; if (len < sizeof (struct sctp_rtoinfo)) return -EINVAL; len = sizeof(struct sctp_rtoinfo); if (copy_from_user(&rtoinfo, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); if (!asoc && rtoinfo.srto_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Values corresponding to the specific association. */ if (asoc) { rtoinfo.srto_initial = jiffies_to_msecs(asoc->rto_initial); rtoinfo.srto_max = jiffies_to_msecs(asoc->rto_max); rtoinfo.srto_min = jiffies_to_msecs(asoc->rto_min); } else { /* Values corresponding to the endpoint. */ struct sctp_sock *sp = sctp_sk(sk); rtoinfo.srto_initial = sp->rtoinfo.srto_initial; rtoinfo.srto_max = sp->rtoinfo.srto_max; rtoinfo.srto_min = sp->rtoinfo.srto_min; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &rtoinfo, len)) return -EFAULT; return 0; } /* * * 7.1.2 SCTP_ASSOCINFO * * This option is used to tune the maximum retransmission attempts * of the association. * Returns an error if the new association retransmission value is * greater than the sum of the retransmission value of the peer. * See [SCTP] for more information. * */ static int sctp_getsockopt_associnfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assocparams assocparams; struct sctp_association *asoc; struct list_head *pos; int cnt = 0; if (len < sizeof (struct sctp_assocparams)) return -EINVAL; len = sizeof(struct sctp_assocparams); if (copy_from_user(&assocparams, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); if (!asoc && assocparams.sasoc_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Values correspoinding to the specific association */ if (asoc) { assocparams.sasoc_asocmaxrxt = asoc->max_retrans; assocparams.sasoc_peer_rwnd = asoc->peer.rwnd; assocparams.sasoc_local_rwnd = asoc->a_rwnd; assocparams.sasoc_cookie_life = ktime_to_ms(asoc->cookie_life); list_for_each(pos, &asoc->peer.transport_addr_list) { cnt++; } assocparams.sasoc_number_peer_destinations = cnt; } else { /* Values corresponding to the endpoint */ struct sctp_sock *sp = sctp_sk(sk); assocparams.sasoc_asocmaxrxt = sp->assocparams.sasoc_asocmaxrxt; assocparams.sasoc_peer_rwnd = sp->assocparams.sasoc_peer_rwnd; assocparams.sasoc_local_rwnd = sp->assocparams.sasoc_local_rwnd; assocparams.sasoc_cookie_life = sp->assocparams.sasoc_cookie_life; assocparams.sasoc_number_peer_destinations = sp->assocparams. sasoc_number_peer_destinations; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &assocparams, len)) return -EFAULT; return 0; } /* * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) * * This socket option is a boolean flag which turns on or off mapped V4 * addresses. If this option is turned on and the socket is type * PF_INET6, then IPv4 addresses will be mapped to V6 representation. * If this option is turned off, then no mapping will be done of V4 * addresses and a user will receive both PF_INET6 and PF_INET type * addresses on the socket. */ static int sctp_getsockopt_mappedv4(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; struct sctp_sock *sp = sctp_sk(sk); if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = sp->v4mapped; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 7.1.29. Set or Get the default context (SCTP_CONTEXT) * (chapter and verse is quoted at sctp_setsockopt_context()) */ static int sctp_getsockopt_context(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; if (len < sizeof(struct sctp_assoc_value)) return -EINVAL; len = sizeof(struct sctp_assoc_value); if (copy_from_user(&params, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; params.assoc_value = asoc ? asoc->default_rcv_context : sctp_sk(sk)->default_rcv_context; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &params, len)) return -EFAULT; return 0; } /* * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG) * This option will get or set the maximum size to put in any outgoing * SCTP DATA chunk. If a message is larger than this size it will be * fragmented by SCTP into the specified size. Note that the underlying * SCTP implementation may fragment into smaller sized chunks when the * PMTU of the underlying association is smaller than the value set by * the user. The default value for this option is '0' which indicates * the user is NOT limiting fragmentation and only the PMTU will effect * SCTP's choice of DATA chunk size. Note also that values set larger * than the maximum size of an IP datagram will effectively let SCTP * control fragmentation (i.e. the same as setting this option to 0). * * The following structure is used to access and modify this parameter: * * struct sctp_assoc_value { * sctp_assoc_t assoc_id; * uint32_t assoc_value; * }; * * assoc_id: This parameter is ignored for one-to-one style sockets. * For one-to-many style sockets this parameter indicates which * association the user is performing an action upon. Note that if * this field's value is zero then the endpoints default value is * changed (effecting future associations only). * assoc_value: This parameter specifies the maximum size in bytes. */ static int sctp_getsockopt_maxseg(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; if (len == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in maxseg socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); params.assoc_id = SCTP_FUTURE_ASSOC; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); if (copy_from_user(&params, optval, len)) return -EFAULT; } else return -EINVAL; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) params.assoc_value = asoc->frag_point; else params.assoc_value = sctp_sk(sk)->user_frag; if (put_user(len, optlen)) return -EFAULT; if (len == sizeof(int)) { if (copy_to_user(optval, &params.assoc_value, len)) return -EFAULT; } else { if (copy_to_user(optval, &params, len)) return -EFAULT; } return 0; } /* * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) * (chapter and verse is quoted at sctp_setsockopt_fragment_interleave()) */ static int sctp_getsockopt_fragment_interleave(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = sctp_sk(sk)->frag_interleave; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 7.1.25. Set or Get the sctp partial delivery point * (chapter and verse is quoted at sctp_setsockopt_partial_delivery_point()) */ static int sctp_getsockopt_partial_delivery_point(struct sock *sk, int len, char __user *optval, int __user *optlen) { u32 val; if (len < sizeof(u32)) return -EINVAL; len = sizeof(u32); val = sctp_sk(sk)->pd_point; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST) * (chapter and verse is quoted at sctp_setsockopt_maxburst()) */ static int sctp_getsockopt_maxburst(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; if (len == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in max_burst socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); params.assoc_id = SCTP_FUTURE_ASSOC; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); if (copy_from_user(&params, optval, len)) return -EFAULT; } else return -EINVAL; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; params.assoc_value = asoc ? asoc->max_burst : sctp_sk(sk)->max_burst; if (len == sizeof(int)) { if (copy_to_user(optval, &params.assoc_value, len)) return -EFAULT; } else { if (copy_to_user(optval, &params, len)) return -EFAULT; } return 0; } static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_hmacalgo __user *p = (void __user *)optval; struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; int i; if (!ep->auth_enable) return -EACCES; hmacs = ep->auth_hmacs_list; data_len = ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr); if (len < sizeof(struct sctp_hmacalgo) + data_len) return -EINVAL; len = sizeof(struct sctp_hmacalgo) + data_len; num_idents = data_len / sizeof(u16); if (put_user(len, optlen)) return -EFAULT; if (put_user(num_idents, &p->shmac_num_idents)) return -EFAULT; for (i = 0; i < num_idents; i++) { __u16 hmacid = ntohs(hmacs->hmac_ids[i]); if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16))) return -EFAULT; } return 0; } static int sctp_getsockopt_active_key(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; if (len < sizeof(struct sctp_authkeyid)) return -EINVAL; len = sizeof(struct sctp_authkeyid); if (copy_from_user(&val, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, val.scact_assoc_id); if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; val.scact_keynumber = asoc->active_key_id; } else { if (!ep->auth_enable) return -EACCES; val.scact_keynumber = ep->active_key_id; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; struct sctp_chunks_param *ch; u32 num_chunks = 0; char __user *to; if (len < sizeof(struct sctp_authchunks)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; asoc = sctp_id2assoc(sk, val.gauth_assoc_id); if (!asoc) return -EINVAL; if (!asoc->peer.auth_capable) return -EACCES; ch = asoc->peer.peer_chunks; if (!ch) goto num; /* See if the user provided enough room for all the data */ num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr); if (len < num_chunks) return -EINVAL; if (copy_to_user(to, ch->chunks, num_chunks)) return -EFAULT; num: len = sizeof(struct sctp_authchunks) + num_chunks; if (put_user(len, optlen)) return -EFAULT; if (put_user(num_chunks, &p->gauth_number_of_chunks)) return -EFAULT; return 0; } static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; struct sctp_chunks_param *ch; u32 num_chunks = 0; char __user *to; if (len < sizeof(struct sctp_authchunks)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; asoc = sctp_id2assoc(sk, val.gauth_assoc_id); if (!asoc && val.gauth_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; ch = (struct sctp_chunks_param *)asoc->c.auth_chunks; } else { if (!ep->auth_enable) return -EACCES; ch = ep->auth_chunk_list; } if (!ch) goto num; num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr); if (len < sizeof(struct sctp_authchunks) + num_chunks) return -EINVAL; if (copy_to_user(to, ch->chunks, num_chunks)) return -EFAULT; num: len = sizeof(struct sctp_authchunks) + num_chunks; if (put_user(len, optlen)) return -EFAULT; if (put_user(num_chunks, &p->gauth_number_of_chunks)) return -EFAULT; return 0; } /* * 8.2.5. Get the Current Number of Associations (SCTP_GET_ASSOC_NUMBER) * This option gets the current number of associations that are attached * to a one-to-many style socket. The option value is an uint32_t. */ static int sctp_getsockopt_assoc_number(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; u32 val = 0; if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (len < sizeof(u32)) return -EINVAL; len = sizeof(u32); list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { val++; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 8.1.23 SCTP_AUTO_ASCONF * See the corresponding setsockopt entry as description */ static int sctp_getsockopt_auto_asconf(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val = 0; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (sctp_sk(sk)->do_auto_asconf && sctp_is_ep_boundall(sk)) val = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 8.2.6. Get the Current Identifiers of Associations * (SCTP_GET_ASSOC_ID_LIST) * * This option gets the current list of SCTP association identifiers of * the SCTP associations handled by a one-to-many style socket. */ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_assoc_ids *ids; size_t ids_size; u32 num = 0; if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (len < sizeof(struct sctp_assoc_ids)) return -EINVAL; list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { num++; } ids_size = struct_size(ids, gaids_assoc_id, num); if (len < ids_size) return -EINVAL; len = ids_size; ids = kmalloc(len, GFP_USER | __GFP_NOWARN); if (unlikely(!ids)) return -ENOMEM; ids->gaids_number_of_ids = num; num = 0; list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { ids->gaids_assoc_id[num++] = asoc->assoc_id; } if (put_user(len, optlen) || copy_to_user(optval, ids, len)) { kfree(ids); return -EFAULT; } kfree(ids); return 0; } /* * SCTP_PEER_ADDR_THLDS * * This option allows us to fetch the partially failed threshold for one or all * transports in an association. See Section 6.1 of: * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, char __user *optval, int len, int __user *optlen, bool v2) { struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; int min; min = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); if (len < min) return -EINVAL; len = min; if (copy_from_user(&val, optval, len)) return -EFAULT; if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { trans = sctp_addr_id2transport(sk, &val.spt_address, val.spt_assoc_id); if (!trans) return -ENOENT; val.spt_pathmaxrxt = trans->pathmaxrxt; val.spt_pathpfthld = trans->pf_retrans; val.spt_pathcpthld = trans->ps_retrans; goto out; } asoc = sctp_id2assoc(sk, val.spt_assoc_id); if (!asoc && val.spt_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { val.spt_pathpfthld = asoc->pf_retrans; val.spt_pathmaxrxt = asoc->pathmaxrxt; val.spt_pathcpthld = asoc->ps_retrans; } else { struct sctp_sock *sp = sctp_sk(sk); val.spt_pathpfthld = sp->pf_retrans; val.spt_pathmaxrxt = sp->pathmaxrxt; val.spt_pathcpthld = sp->ps_retrans; } out: if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * SCTP_GET_ASSOC_STATS * * This option retrieves local per endpoint statistics. It is modeled * after OpenSolaris' implementation */ static int sctp_getsockopt_assoc_stats(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_stats sas; struct sctp_association *asoc = NULL; /* User must provide at least the assoc id */ if (len < sizeof(sctp_assoc_t)) return -EINVAL; /* Allow the struct to grow and fill in as much as possible */ len = min_t(size_t, len, sizeof(sas)); if (copy_from_user(&sas, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, sas.sas_assoc_id); if (!asoc) return -EINVAL; sas.sas_rtxchunks = asoc->stats.rtxchunks; sas.sas_gapcnt = asoc->stats.gapcnt; sas.sas_outofseqtsns = asoc->stats.outofseqtsns; sas.sas_osacks = asoc->stats.osacks; sas.sas_isacks = asoc->stats.isacks; sas.sas_octrlchunks = asoc->stats.octrlchunks; sas.sas_ictrlchunks = asoc->stats.ictrlchunks; sas.sas_oodchunks = asoc->stats.oodchunks; sas.sas_iodchunks = asoc->stats.iodchunks; sas.sas_ouodchunks = asoc->stats.ouodchunks; sas.sas_iuodchunks = asoc->stats.iuodchunks; sas.sas_idupchunks = asoc->stats.idupchunks; sas.sas_opackets = asoc->stats.opackets; sas.sas_ipackets = asoc->stats.ipackets; /* New high max rto observed, will return 0 if not a single * RTO update took place. obs_rto_ipaddr will be bogus * in such a case */ sas.sas_maxrto = asoc->stats.max_obs_rto; memcpy(&sas.sas_obs_rto_ipaddr, &asoc->stats.obs_rto_ipaddr, sizeof(struct sockaddr_storage)); /* Mark beginning of a new observation period */ asoc->stats.max_obs_rto = asoc->rto_min; if (put_user(len, optlen)) return -EFAULT; pr_debug("%s: len:%d, assoc_id:%d\n", __func__, len, sas.sas_assoc_id); if (copy_to_user(optval, &sas, len)) return -EFAULT; return 0; } static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val = 0; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (sctp_sk(sk)->recvrcvinfo) val = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val = 0; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (sctp_sk(sk)->recvnxtinfo) val = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_pr_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.prsctp_capable : sctp_sk(sk)->ep->prsctp_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_default_prinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_default_prinfo info; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(info)) { retval = -EINVAL; goto out; } len = sizeof(info); if (copy_from_user(&info, optval, len)) goto out; asoc = sctp_id2assoc(sk, info.pr_assoc_id); if (!asoc && info.pr_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } if (asoc) { info.pr_policy = SCTP_PR_POLICY(asoc->default_flags); info.pr_value = asoc->default_timetolive; } else { struct sctp_sock *sp = sctp_sk(sk); info.pr_policy = SCTP_PR_POLICY(sp->default_flags); info.pr_value = sp->default_timetolive; } if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &info, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_prstatus params; struct sctp_association *asoc; int policy; int retval = -EINVAL; if (len < sizeof(params)) goto out; len = sizeof(params); if (copy_from_user(&params, optval, len)) { retval = -EFAULT; goto out; } policy = params.sprstat_policy; if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc) goto out; if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += asoc->abandoned_unsent[policy]; params.sprstat_abandoned_sent += asoc->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = asoc->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = asoc->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen)) { retval = -EFAULT; goto out; } if (copy_to_user(optval, &params, len)) { retval = -EFAULT; goto out; } retval = 0; out: return retval; } static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_stream_out_ext *streamoute; struct sctp_association *asoc; struct sctp_prstatus params; int retval = -EINVAL; int policy; if (len < sizeof(params)) goto out; len = sizeof(params); if (copy_from_user(&params, optval, len)) { retval = -EFAULT; goto out; } policy = params.sprstat_policy; if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; streamoute = SCTP_SO(&asoc->stream, params.sprstat_sid)->ext; if (!streamoute) { /* Not allocated yet, means all stats are 0 */ params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; retval = 0; goto out; } if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += streamoute->abandoned_unsent[policy]; params.sprstat_abandoned_sent += streamoute->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen) || copy_to_user(optval, &params, len)) { retval = -EFAULT; goto out; } retval = 0; out: return retval; } static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.reconf_capable : sctp_sk(sk)->ep->reconf_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_enable_strreset(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->strreset_enable : sctp_sk(sk)->ep->strreset_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_scheduler(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? sctp_sched_get_sched(asoc) : sctp_sk(sk)->default_ss; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_scheduler_value(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_stream_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc) { retval = -EINVAL; goto out; } retval = sctp_sched_get_value(asoc, params.stream_id, &params.stream_value); if (retval) goto out; if (put_user(len, optlen)) { retval = -EFAULT; goto out; } if (copy_to_user(optval, &params, len)) { retval = -EFAULT; goto out; } out: return retval; } static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.intl_capable : sctp_sk(sk)->ep->intl_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_reuse_port(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = sctp_sk(sk)->reuse; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_association *asoc; struct sctp_event param; __u16 subscribe; if (len < sizeof(param)) return -EINVAL; len = sizeof(param); if (copy_from_user(&param, optval, len)) return -EFAULT; if (param.se_type < SCTP_SN_TYPE_BASE || param.se_type > SCTP_SN_TYPE_MAX) return -EINVAL; asoc = sctp_id2assoc(sk, param.se_assoc_id); if (!asoc && param.se_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe; param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &param, len)) return -EFAULT; return 0; } static int sctp_getsockopt_asconf_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.asconf_capable : sctp_sk(sk)->ep->asconf_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_auth_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.auth_capable : sctp_sk(sk)->ep->auth_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_ecn_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.ecn_capable : sctp_sk(sk)->ep->ecn_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_pf_expose(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->pf_expose : sctp_sk(sk)->pf_expose; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_encap_port(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_association *asoc; struct sctp_udpencaps encap; struct sctp_transport *t; __be16 encap_port; if (len < sizeof(encap)) return -EINVAL; len = sizeof(encap); if (copy_from_user(&encap, optval, len)) return -EFAULT; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&encap.sue_address)) { t = sctp_addr_id2transport(sk, &encap.sue_address, encap.sue_assoc_id); if (!t) { pr_debug("%s: failed no transport\n", __func__); return -EINVAL; } encap_port = t->encap_port; goto out; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, encap.sue_assoc_id); if (!asoc && encap.sue_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { pr_debug("%s: failed no association\n", __func__); return -EINVAL; } if (asoc) { encap_port = asoc->encap_port; goto out; } encap_port = sctp_sk(sk)->encap_port; out: encap.sue_port = (__force uint16_t)encap_port; if (copy_to_user(optval, &encap, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int sctp_getsockopt_probe_interval(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_probeinterval params; struct sctp_association *asoc; struct sctp_transport *t; __u32 probe_interval; if (len < sizeof(params)) return -EINVAL; len = sizeof(params); if (copy_from_user(&params, optval, len)) return -EFAULT; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&params.spi_address)) { t = sctp_addr_id2transport(sk, &params.spi_address, params.spi_assoc_id); if (!t) { pr_debug("%s: failed no transport\n", __func__); return -EINVAL; } probe_interval = jiffies_to_msecs(t->probe_interval); goto out; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params.spi_assoc_id); if (!asoc && params.spi_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { pr_debug("%s: failed no association\n", __func__); return -EINVAL; } if (asoc) { probe_interval = jiffies_to_msecs(asoc->probe_interval); goto out; } probe_interval = sctp_sk(sk)->probe_interval; out: params.spi_interval = probe_interval; if (copy_to_user(optval, &params, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int retval = 0; int len; pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname); /* I can hardly begin to describe how wrong this is. This is * so broken as to be worse than useless. The API draft * REALLY is NOT helpful here... I am not convinced that the * semantics of getsockopt() with a level OTHER THAN SOL_SCTP * are at all well-founded. */ if (level != SOL_SCTP) { struct sctp_af *af = sctp_sk(sk)->pf->af; retval = af->getsockopt(sk, level, optname, optval, optlen); return retval; } if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; lock_sock(sk); switch (optname) { case SCTP_STATUS: retval = sctp_getsockopt_sctp_status(sk, len, optval, optlen); break; case SCTP_DISABLE_FRAGMENTS: retval = sctp_getsockopt_disable_fragments(sk, len, optval, optlen); break; case SCTP_EVENTS: retval = sctp_getsockopt_events(sk, len, optval, optlen); break; case SCTP_AUTOCLOSE: retval = sctp_getsockopt_autoclose(sk, len, optval, optlen); break; case SCTP_SOCKOPT_PEELOFF: retval = sctp_getsockopt_peeloff(sk, len, optval, optlen); break; case SCTP_SOCKOPT_PEELOFF_FLAGS: retval = sctp_getsockopt_peeloff_flags(sk, len, optval, optlen); break; case SCTP_PEER_ADDR_PARAMS: retval = sctp_getsockopt_peer_addr_params(sk, len, optval, optlen); break; case SCTP_DELAYED_SACK: retval = sctp_getsockopt_delayed_ack(sk, len, optval, optlen); break; case SCTP_INITMSG: retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); break; case SCTP_GET_PEER_ADDRS: retval = sctp_getsockopt_peer_addrs(sk, len, optval, optlen); break; case SCTP_GET_LOCAL_ADDRS: retval = sctp_getsockopt_local_addrs(sk, len, optval, optlen); break; case SCTP_SOCKOPT_CONNECTX3: retval = sctp_getsockopt_connectx3(sk, len, optval, optlen); break; case SCTP_DEFAULT_SEND_PARAM: retval = sctp_getsockopt_default_send_param(sk, len, optval, optlen); break; case SCTP_DEFAULT_SNDINFO: retval = sctp_getsockopt_default_sndinfo(sk, len, optval, optlen); break; case SCTP_PRIMARY_ADDR: retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen); break; case SCTP_NODELAY: retval = sctp_getsockopt_nodelay(sk, len, optval, optlen); break; case SCTP_RTOINFO: retval = sctp_getsockopt_rtoinfo(sk, len, optval, optlen); break; case SCTP_ASSOCINFO: retval = sctp_getsockopt_associnfo(sk, len, optval, optlen); break; case SCTP_I_WANT_MAPPED_V4_ADDR: retval = sctp_getsockopt_mappedv4(sk, len, optval, optlen); break; case SCTP_MAXSEG: retval = sctp_getsockopt_maxseg(sk, len, optval, optlen); break; case SCTP_GET_PEER_ADDR_INFO: retval = sctp_getsockopt_peer_addr_info(sk, len, optval, optlen); break; case SCTP_ADAPTATION_LAYER: retval = sctp_getsockopt_adaptation_layer(sk, len, optval, optlen); break; case SCTP_CONTEXT: retval = sctp_getsockopt_context(sk, len, optval, optlen); break; case SCTP_FRAGMENT_INTERLEAVE: retval = sctp_getsockopt_fragment_interleave(sk, len, optval, optlen); break; case SCTP_PARTIAL_DELIVERY_POINT: retval = sctp_getsockopt_partial_delivery_point(sk, len, optval, optlen); break; case SCTP_MAX_BURST: retval = sctp_getsockopt_maxburst(sk, len, optval, optlen); break; case SCTP_AUTH_KEY: case SCTP_AUTH_CHUNK: case SCTP_AUTH_DELETE_KEY: case SCTP_AUTH_DEACTIVATE_KEY: retval = -EOPNOTSUPP; break; case SCTP_HMAC_IDENT: retval = sctp_getsockopt_hmac_ident(sk, len, optval, optlen); break; case SCTP_AUTH_ACTIVE_KEY: retval = sctp_getsockopt_active_key(sk, len, optval, optlen); break; case SCTP_PEER_AUTH_CHUNKS: retval = sctp_getsockopt_peer_auth_chunks(sk, len, optval, optlen); break; case SCTP_LOCAL_AUTH_CHUNKS: retval = sctp_getsockopt_local_auth_chunks(sk, len, optval, optlen); break; case SCTP_GET_ASSOC_NUMBER: retval = sctp_getsockopt_assoc_number(sk, len, optval, optlen); break; case SCTP_GET_ASSOC_ID_LIST: retval = sctp_getsockopt_assoc_ids(sk, len, optval, optlen); break; case SCTP_AUTO_ASCONF: retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen); break; case SCTP_PEER_ADDR_THLDS: retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen, false); break; case SCTP_PEER_ADDR_THLDS_V2: retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen, true); break; case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); break; case SCTP_RECVRCVINFO: retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen); break; case SCTP_RECVNXTINFO: retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); break; case SCTP_PR_SUPPORTED: retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen); break; case SCTP_DEFAULT_PRINFO: retval = sctp_getsockopt_default_prinfo(sk, len, optval, optlen); break; case SCTP_PR_ASSOC_STATUS: retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, optlen); break; case SCTP_PR_STREAM_STATUS: retval = sctp_getsockopt_pr_streamstatus(sk, len, optval, optlen); break; case SCTP_RECONFIG_SUPPORTED: retval = sctp_getsockopt_reconfig_supported(sk, len, optval, optlen); break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); break; case SCTP_STREAM_SCHEDULER: retval = sctp_getsockopt_scheduler(sk, len, optval, optlen); break; case SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_getsockopt_scheduler_value(sk, len, optval, optlen); break; case SCTP_INTERLEAVING_SUPPORTED: retval = sctp_getsockopt_interleaving_supported(sk, len, optval, optlen); break; case SCTP_REUSE_PORT: retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); break; case SCTP_EVENT: retval = sctp_getsockopt_event(sk, len, optval, optlen); break; case SCTP_ASCONF_SUPPORTED: retval = sctp_getsockopt_asconf_supported(sk, len, optval, optlen); break; case SCTP_AUTH_SUPPORTED: retval = sctp_getsockopt_auth_supported(sk, len, optval, optlen); break; case SCTP_ECN_SUPPORTED: retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen); break; case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); break; case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_getsockopt_encap_port(sk, len, optval, optlen); break; case SCTP_PLPMTUD_PROBE_INTERVAL: retval = sctp_getsockopt_probe_interval(sk, len, optval, optlen); break; default: retval = -ENOPROTOOPT; break; } release_sock(sk); return retval; } static bool sctp_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SCTP) { switch (optname) { case SCTP_SOCKOPT_PEELOFF: case SCTP_SOCKOPT_PEELOFF_FLAGS: case SCTP_SOCKOPT_CONNECTX3: return true; default: return false; } } return false; } static int sctp_hash(struct sock *sk) { /* STUB */ return 0; } static void sctp_unhash(struct sock *sk) { /* STUB */ } /* Check if port is acceptable. Possibly find first available port. * * The port hash table (contained in the 'global' SCTP protocol storage * returned by struct sctp_protocol *sctp_get_protocol()). The hash * table is an array of 4096 lists (sctp_bind_hashbucket). Each * list (the list number is the port number hashed out, so as you * would expect from a hash function, all the ports in a given list have * such a number that hashes out to the same list number; you were * expecting that, right?); so each list has a set of ports, with a * link to the socket (struct sock) that uses it, the port number and * a fastreuse flag (FIXME: NPI ipg). */ static struct sctp_bind_bucket *sctp_bucket_create( struct sctp_bind_hashbucket *head, struct net *, unsigned short snum); static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { struct sctp_sock *sp = sctp_sk(sk); bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ struct net *net = sock_net(sk); kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; unsigned short snum; int ret; snum = ntohs(addr->v4.sin_port); pr_debug("%s: begins, snum:%d\n", __func__, snum); if (snum == 0) { /* Search for an available port. */ int low, high, remaining, index; unsigned int rover; inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = prandom_u32() % remaining + low; do { rover++; if ((rover < low) || (rover > high)) rover = low; if (inet_is_local_reserved_port(net, rover)) continue; index = sctp_phashfn(net, rover); head = &sctp_port_hashtable[index]; spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && net_eq(net, pp->net)) goto next; break; next: spin_unlock_bh(&head->lock); cond_resched(); } while (--remaining > 0); /* Exhausted local port range during search? */ ret = 1; if (remaining <= 0) return ret; /* OK, here is the one we will use. HEAD (the port * hash table list entry) is non-NULL and we hold it's * mutex. */ snum = rover; } else { /* We are given an specific port number; we verify * that it is not being used. If it is used, we will * exahust the search in the hash list corresponding * to the port number (snum) - we detect that with the * port iterator, pp being NULL. */ head = &sctp_port_hashtable[sctp_phashfn(net, snum)]; spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) { if ((pp->port == snum) && net_eq(pp->net, net)) goto pp_found; } } pp = NULL; goto pp_not_found; pp_found: if (!hlist_empty(&pp->owner)) { /* We had a port hash table hit - there is an * available port (pp != NULL) and it is being * used by other socket (pp->owner not empty); that other * socket is going to be sk2. */ struct sock *sk2; pr_debug("%s: found a possible match\n", __func__); if ((pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) || (pp->fastreuseport && sk->sk_reuseport && uid_eq(pp->fastuid, uid))) goto success; /* Run through the list of sockets bound to the port * (pp->port) [via the pointers bind_next and * bind_pprev in the struct sock *sk2 (pp->sk)]. On each one, * we get the endpoint they describe and run through * the endpoint's list of IP (v4 or v6) addresses, * comparing each of the addresses with the address of * the socket sk. If we find a match, then that means * that this port/socket (sk) combination are already * in an endpoint. */ sk_for_each_bound(sk2, &pp->owner) { struct sctp_sock *sp2 = sctp_sk(sk2); struct sctp_endpoint *ep2 = sp2->ep; if (sk == sk2 || (reuse && (sk2->sk_reuse || sp2->reuse) && sk2->sk_state != SCTP_SS_LISTENING) || (sk->sk_reuseport && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)))) continue; if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, sp2, sp)) { ret = 1; goto fail_unlock; } } pr_debug("%s: found a match\n", __func__); } pp_not_found: /* If there was a hash table miss, create a new port. */ ret = 1; if (!pp && !(pp = sctp_bucket_create(head, net, snum))) goto fail_unlock; /* In either case (hit or miss), make sure fastreuse is 1 only * if sk->sk_reuse is too (that is, if the caller requested * SO_REUSEADDR on this socket -sk-). */ if (hlist_empty(&pp->owner)) { if (reuse && sk->sk_state != SCTP_SS_LISTENING) pp->fastreuse = 1; else pp->fastreuse = 0; if (sk->sk_reuseport) { pp->fastreuseport = 1; pp->fastuid = uid; } else { pp->fastreuseport = 0; } } else { if (pp->fastreuse && (!reuse || sk->sk_state == SCTP_SS_LISTENING)) pp->fastreuse = 0; if (pp->fastreuseport && (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid))) pp->fastreuseport = 0; } /* We are set, so fill up all the data in the hash table * entry, tie the socket list information with the rest of the * sockets FIXME: Blurry, NPI (ipg). */ success: if (!sp->bind_hash) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &pp->owner); sp->bind_hash = pp; } ret = 0; fail_unlock: spin_unlock_bh(&head->lock); return ret; } /* Assign a 'snum' port to the socket. If snum == 0, an ephemeral * port is requested. */ static int sctp_get_port(struct sock *sk, unsigned short snum) { union sctp_addr addr; struct sctp_af *af = sctp_sk(sk)->pf->af; /* Set up a dummy address struct from the sk. */ af->from_sk(&addr, sk); addr.v4.sin_port = htons(snum); /* Note: sk->sk_num gets filled in if ephemeral port request. */ return sctp_get_port_local(sk, &addr); } /* * Move a socket to LISTENING state. */ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; struct crypto_shash *tfm = NULL; char alg[32]; int err; /* Allocate HMAC for generating cookie. */ if (!sp->hmac && sp->sctp_hmac_alg) { sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); tfm = crypto_alloc_shash(alg, 0, 0); if (IS_ERR(tfm)) { net_info_ratelimited("failed to load transform for %s: %ld\n", sp->sctp_hmac_alg, PTR_ERR(tfm)); return -ENOSYS; } sctp_sk(sk)->hmac = tfm; } /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system * picks an ephemeral port and will choose an address set equivalent * to binding with a wildcard address. * * This is not currently spelled out in the SCTP sockets * extensions draft, but follows the practice as seen in TCP * sockets. * */ inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) { err = -EAGAIN; goto err; } } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { err = -EADDRINUSE; goto err; } } WRITE_ONCE(sk->sk_max_ack_backlog, backlog); err = sctp_hash_endpoint(ep); if (err) goto err; return 0; err: inet_sk_set_state(sk, SCTP_SS_CLOSED); return err; } /* * 4.1.3 / 5.1.3 listen() * * By default, new associations are not accepted for UDP style sockets. * An application uses listen() to mark a socket as being able to * accept new associations. * * On TCP style sockets, applications use listen() to ready the SCTP * endpoint for accepting inbound associations. * * On both types of endpoints a backlog of '0' disables listening. * * Move a socket to LISTENING state. */ int sctp_inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct sctp_endpoint *ep = sctp_sk(sk)->ep; int err = -EINVAL; if (unlikely(backlog < 0)) return err; lock_sock(sk); /* Peeled-off sockets are not allowed to listen(). */ if (sctp_style(sk, UDP_HIGH_BANDWIDTH)) goto out; if (sock->state != SS_UNCONNECTED) goto out; if (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED)) goto out; /* If backlog is zero, disable listening. */ if (!backlog) { if (sctp_sstate(sk, CLOSED)) goto out; err = 0; sctp_unhash_endpoint(ep); sk->sk_state = SCTP_SS_CLOSED; if (sk->sk_reuse || sctp_sk(sk)->reuse) sctp_sk(sk)->bind_hash->fastreuse = 1; goto out; } /* If we are already listening, just update the backlog */ if (sctp_sstate(sk, LISTENING)) WRITE_ONCE(sk->sk_max_ack_backlog, backlog); else { err = sctp_listen_start(sk, backlog); if (err) goto out; } err = 0; out: release_sock(sk); return err; } /* * This function is done by modeling the current datagram_poll() and the * tcp_poll(). Note that, based on these implementations, we don't * lock the socket in this function, even though it seems that, * ideally, locking or some other mechanisms can be used to ensure * the integrity of the counters (sndbuf and wmem_alloc) used * in this place. We assume that we don't need locks either until proven * otherwise. * * Another thing to note is that we include the Async I/O support * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. */ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); __poll_t mask; poll_wait(file, sk_sleep(sk), wait); sock_rps_record_flow(sk); /* A TCP-style listening socket becomes readable when the accept queue * is not empty. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) return (!list_empty(&sp->ep->asocs)) ? (EPOLLIN | EPOLLRDNORM) : 0; mask = 0; /* Is there any exceptional events? */ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* Is it readable? Reconsider this code with TCP-style support. */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* The association is either gone or not ready. */ if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED)) return mask; /* Is it writable? */ if (sctp_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* * Since the socket is not locked, the buffer * might be made available after the writeable check and * before the bit is set. This could cause a lost I/O * signal. tcp_poll() has a race breaker for this race * condition. Based on their implementation, we put * in the following code to cover it as well. */ if (sctp_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; } return mask; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ static struct sctp_bind_bucket *sctp_bucket_create( struct sctp_bind_hashbucket *head, struct net *net, unsigned short snum) { struct sctp_bind_bucket *pp; pp = kmem_cache_alloc(sctp_bucket_cachep, GFP_ATOMIC); if (pp) { SCTP_DBG_OBJCNT_INC(bind_bucket); pp->port = snum; pp->fastreuse = 0; INIT_HLIST_HEAD(&pp->owner); pp->net = net; hlist_add_head(&pp->node, &head->chain); } return pp; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ static void sctp_bucket_destroy(struct sctp_bind_bucket *pp) { if (pp && hlist_empty(&pp->owner)) { __hlist_del(&pp->node); kmem_cache_free(sctp_bucket_cachep, pp); SCTP_DBG_OBJCNT_DEC(bind_bucket); } } /* Release this socket's reference to a local port. */ static inline void __sctp_put_port(struct sock *sk) { struct sctp_bind_hashbucket *head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), inet_sk(sk)->inet_num)]; struct sctp_bind_bucket *pp; spin_lock(&head->lock); pp = sctp_sk(sk)->bind_hash; __sk_del_bind_node(sk); sctp_sk(sk)->bind_hash = NULL; inet_sk(sk)->inet_num = 0; sctp_bucket_destroy(pp); spin_unlock(&head->lock); } void sctp_put_port(struct sock *sk) { local_bh_disable(); __sctp_put_port(sk); local_bh_enable(); } /* * The system picks an ephemeral port and choose an address set equivalent * to binding with a wildcard address. * One of those addresses will be the primary address for the association. * This automatically enables the multihoming capability of SCTP. */ static int sctp_autobind(struct sock *sk) { union sctp_addr autoaddr; struct sctp_af *af; __be16 port; /* Initialize a local sockaddr structure to INADDR_ANY. */ af = sctp_sk(sk)->pf->af; port = htons(inet_sk(sk)->inet_num); af->inaddr_any(&autoaddr, port); return sctp_do_bind(sk, &autoaddr, af->sockaddr_len); } /* Parse out IPPROTO_SCTP CMSG headers. Perform only minimal validation. * * From RFC 2292 * 4.2 The cmsghdr Structure * * * When ancillary data is sent or received, any number of ancillary data * objects can be specified by the msg_control and msg_controllen members of * the msghdr structure, because each object is preceded by * a cmsghdr structure defining the object's length (the cmsg_len member). * Historically Berkeley-derived implementations have passed only one object * at a time, but this API allows multiple objects to be * passed in a single call to sendmsg() or recvmsg(). The following example * shows two ancillary data objects in a control buffer. * * |<--------------------------- msg_controllen -------------------------->| * | | * * |<----- ancillary data object ----->|<----- ancillary data object ----->| * * |<---------- CMSG_SPACE() --------->|<---------- CMSG_SPACE() --------->| * | | | * * |<---------- cmsg_len ---------->| |<--------- cmsg_len ----------->| | * * |<--------- CMSG_LEN() --------->| |<-------- CMSG_LEN() ---------->| | * | | | | | * * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ * |cmsg_|cmsg_|cmsg_|XX| |XX|cmsg_|cmsg_|cmsg_|XX| |XX| * * |len |level|type |XX|cmsg_data[]|XX|len |level|type |XX|cmsg_data[]|XX| * * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ * ^ * | * * msg_control * points here */ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) { struct msghdr *my_msg = (struct msghdr *)msg; struct cmsghdr *cmsg; for_each_cmsghdr(cmsg, my_msg) { if (!CMSG_OK(my_msg, cmsg)) return -EINVAL; /* Should we parse this header or ignore? */ if (cmsg->cmsg_level != IPPROTO_SCTP) continue; /* Strictly check lengths following example in SCM code. */ switch (cmsg->cmsg_type) { case SCTP_INIT: /* SCTP Socket API Extension * 5.3.1 SCTP Initiation Structure (SCTP_INIT) * * This cmsghdr structure provides information for * initializing new SCTP associations with sendmsg(). * The SCTP_INITMSG socket option uses this same data * structure. This structure is not used for * recvmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_initmsg))) return -EINVAL; cmsgs->init = CMSG_DATA(cmsg); break; case SCTP_SNDRCV: /* SCTP Socket API Extension * 5.3.2 SCTP Header Information Structure(SCTP_SNDRCV) * * This cmsghdr structure specifies SCTP options for * sendmsg() and describes SCTP header information * about a received message through recvmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) return -EINVAL; cmsgs->srinfo = CMSG_DATA(cmsg); if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; case SCTP_SNDINFO: /* SCTP Socket API Extension * 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO) * * This cmsghdr structure specifies SCTP options for * sendmsg(). This structure and SCTP_RCVINFO replaces * SCTP_SNDRCV which has been deprecated. * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_SNDINFO struct sctp_sndinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndinfo))) return -EINVAL; cmsgs->sinfo = CMSG_DATA(cmsg); if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; case SCTP_PRINFO: /* SCTP Socket API Extension * 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO) * * This cmsghdr structure specifies SCTP options for sendmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_PRINFO struct sctp_prinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_prinfo))) return -EINVAL; cmsgs->prinfo = CMSG_DATA(cmsg); if (cmsgs->prinfo->pr_policy & ~SCTP_PR_SCTP_MASK) return -EINVAL; if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) cmsgs->prinfo->pr_value = 0; break; case SCTP_AUTHINFO: /* SCTP Socket API Extension * 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO) * * This cmsghdr structure specifies SCTP options for sendmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_AUTHINFO struct sctp_authinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_authinfo))) return -EINVAL; cmsgs->authinfo = CMSG_DATA(cmsg); break; case SCTP_DSTADDRV4: case SCTP_DSTADDRV6: /* SCTP Socket API Extension * 5.3.9/10 SCTP Destination IPv4/6 Address Structure (SCTP_DSTADDRV4/6) * * This cmsghdr structure specifies SCTP options for sendmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_DSTADDRV4 struct in_addr * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_DSTADDRV6 struct in6_addr */ cmsgs->addrs_msg = my_msg; break; default: return -EINVAL; } } return 0; } /* * Wait for a packet.. * Note: This function is the same function as in core/datagram.c * with a few modifications to make lksctp work. */ static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; DEFINE_WAIT(wait); prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors? */ error = sock_error(sk); if (error) goto out; if (!skb_queue_empty(&sk->sk_receive_queue)) goto ready; /* Socket shut down? */ if (sk->sk_shutdown & RCV_SHUTDOWN) goto out; /* Sequenced packets can come disconnected. If so we report the * problem. */ error = -ENOTCONN; /* Is there a good reason to think that we may receive some data? */ if (list_empty(&sctp_sk(sk)->ep->asocs) && !sctp_sstate(sk, LISTENING)) goto out; /* Handle signals. */ if (signal_pending(current)) goto interrupted; /* Let another process have a go. Since we are going to sleep * anyway. Note: This may cause odd behaviors if the message * does not fit in the user's buffer, but this seems to be the * only way to honor MSG_DONTWAIT realistically. */ release_sock(sk); *timeo_p = schedule_timeout(*timeo_p); lock_sock(sk); ready: finish_wait(sk_sleep(sk), &wait); return 0; interrupted: error = sock_intr_errno(*timeo_p); out: finish_wait(sk_sleep(sk), &wait); *err = error; return error; } /* Receive a datagram. * Note: This is pretty much the same routine as in core/datagram.c * with a few changes to make lksctp work. */ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, int noblock, int *err) { int error; struct sk_buff *skb; long timeo; timeo = sock_rcvtimeo(sk, noblock); pr_debug("%s: timeo:%ld, max:%ld\n", __func__, timeo, MAX_SCHEDULE_TIMEOUT); do { /* Again only user level code calls this function, * so nothing interrupt level * will suddenly eat the receive_queue. * * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ if (flags & MSG_PEEK) { skb = skb_peek(&sk->sk_receive_queue); if (skb) refcount_inc(&skb->users); } else { skb = __skb_dequeue(&sk->sk_receive_queue); } if (skb) return skb; /* Caller is allowed not to check sk->sk_err before calling. */ error = sock_error(sk); if (error) goto no_packet; if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk_can_busy_loop(sk)) { sk_busy_loop(sk, noblock); if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) continue; } /* User doesn't want to wait. */ error = -EAGAIN; if (!timeo) goto no_packet; } while (sctp_wait_for_packet(sk, err, &timeo) == 0); return NULL; no_packet: *err = error; return NULL; } /* If sndbuf has changed, wake up per association sndbuf waiters. */ static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; if (sctp_wspace(asoc) <= 0) return; if (waitqueue_active(&asoc->wait)) wake_up_interruptible(&asoc->wait); if (sctp_writeable(sk)) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (wq) { if (waitqueue_active(&wq->wait)) wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. */ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } } static void sctp_wake_up_waiters(struct sock *sk, struct sctp_association *asoc) { struct sctp_association *tmp = asoc; /* We do accounting for the sndbuf space per association, * so we only need to wake our own association. */ if (asoc->ep->sndbuf_policy) return __sctp_write_space(asoc); /* If association goes down and is just flushing its * outq, then just normally notify others. */ if (asoc->base.dead) return sctp_write_space(sk); /* Accounting for the sndbuf space is per socket, so we * need to wake up others, try to be fair and in case of * other associations, let them have a go first instead * of just doing a sctp_write_space() call. * * Note that we reach sctp_wake_up_waiters() only when * associations free up queued chunks, thus we are under * lock and the list of associations on a socket is * guaranteed not to change. */ for (tmp = list_next_entry(tmp, asocs); 1; tmp = list_next_entry(tmp, asocs)) { /* Manually skip the head element. */ if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs)) continue; /* Wake up association. */ __sctp_write_space(tmp); /* We've reached the end. */ if (tmp == asoc) break; } } /* Do accounting for the sndbuf space. * Decrement the used sndbuf space of the corresponding association by the * data size which was just transmitted(freed). */ static void sctp_wfree(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; struct sctp_association *asoc = chunk->asoc; struct sock *sk = asoc->base.sk; sk_mem_uncharge(sk, skb->truesize); sk_wmem_queued_add(sk, -(skb->truesize + sizeof(struct sctp_chunk))); asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk); WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc)); if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; /* refcnt == 2 and !list_empty mean after this release, it's * not being used anywhere, and it's time to notify userland * that this shkey can be freed if it's been deactivated. */ if (shkey->deactivated && !list_empty(&shkey->key_list) && refcount_read(&shkey->refcnt) == 2) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_auth_shkey_release(chunk->shkey); } sock_wfree(skb); sctp_wake_up_waiters(sk, asoc); sctp_association_put(asoc); } /* Do accounting for the receive space on the socket. * Accounting for the association is done in ulpevent.c * We set this as a destructor for the cloned data skbs so that * accounting is done at the correct time. */ void sctp_sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct sctp_ulpevent *event = sctp_skb2event(skb); atomic_sub(event->rmem_len, &sk->sk_rmem_alloc); /* * Mimic the behavior of sock_rfree */ sk_mem_uncharge(sk, event->rmem_len); } /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, size_t msg_len) { struct sock *sk = asoc->base.sk; long current_timeo = *timeo_p; DEFINE_WAIT(wait); int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); /* Increment the association's refcnt. */ sctp_association_hold(asoc); /* Wait on the association specific sndbuf space. */ for (;;) { prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE); if (asoc->base.dead) goto do_dead; if (!*timeo_p) goto do_nonblock; if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; if (signal_pending(current)) goto do_interrupted; if (sk_under_memory_pressure(sk)) sk_mem_reclaim(sk); if ((int)msg_len <= sctp_wspace(asoc) && sk_wmem_schedule(sk, msg_len)) break; /* Let another process have a go. Since we are going * to sleep anyway. */ release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); if (sk != asoc->base.sk) goto do_error; *timeo_p = current_timeo; } out: finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ sctp_association_put(asoc); return err; do_dead: err = -ESRCH; goto out; do_error: err = -EPIPE; goto out; do_interrupted: err = sock_intr_errno(*timeo_p); goto out; do_nonblock: err = -EAGAIN; goto out; } void sctp_data_ready(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } /* If socket sndbuf has changed, wake up all per association waiters. */ void sctp_write_space(struct sock *sk) { struct sctp_association *asoc; /* Wake up the tasks in each wait queue. */ list_for_each_entry(asoc, &((sctp_sk(sk))->ep->asocs), asocs) { __sctp_write_space(asoc); } } /* Is there any sndbuf space available on the socket? * * Note that sk_wmem_alloc is the sum of the send buffers on all of the * associations on the same socket. For a UDP-style socket with * multiple associations, it is possible for it to be "unwriteable" * prematurely. I assume that this is acceptable because * a premature "unwriteable" is better than an accidental "writeable" which * would cause an unwanted block under certain circumstances. For the 1-1 * UDP-style sockets or TCP-style sockets, this code should work. * - Daisy */ static bool sctp_writeable(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) > READ_ONCE(sk->sk_wmem_queued); } /* Wait for an association to go into ESTABLISHED state. If timeout is 0, * returns immediately with EINPROGRESS. */ static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) { struct sock *sk = asoc->base.sk; int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); pr_debug("%s: asoc:%p, timeo:%ld\n", __func__, asoc, *timeo_p); /* Increment the association's refcnt. */ sctp_association_hold(asoc); for (;;) { prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE); if (!*timeo_p) goto do_nonblock; if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || asoc->base.dead) goto do_error; if (signal_pending(current)) goto do_interrupted; if (sctp_state(asoc, ESTABLISHED)) break; /* Let another process have a go. Since we are going * to sleep anyway. */ release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); *timeo_p = current_timeo; } out: finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ sctp_association_put(asoc); return err; do_error: if (asoc->init_err_counter + 1 > asoc->max_init_attempts) err = -ETIMEDOUT; else err = -ECONNREFUSED; goto out; do_interrupted: err = sock_intr_errno(*timeo_p); goto out; do_nonblock: err = -EINPROGRESS; goto out; } static int sctp_wait_for_accept(struct sock *sk, long timeo) { struct sctp_endpoint *ep; int err = 0; DEFINE_WAIT(wait); ep = sctp_sk(sk)->ep; for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&ep->asocs)) { release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); } err = -EINVAL; if (!sctp_sstate(sk, LISTENING)) break; err = 0; if (!list_empty(&ep->asocs)) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; } static void sctp_wait_for_close(struct sock *sk, long timeout) { DEFINE_WAIT(wait); do { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&sctp_sk(sk)->ep->asocs)) break; release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } while (!signal_pending(current) && timeout); finish_wait(sk_sleep(sk), &wait); } static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk) { struct sk_buff *frag; if (!skb->data_len) goto done; /* Don't forget the fragments. */ skb_walk_frags(skb, frag) sctp_skb_set_owner_r_frag(frag, sk); done: sctp_skb_set_owner_r(skb, sk); } void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc) { struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; newsk->sk_flags = sk->sk_flags; newsk->sk_tsflags = sk->sk_tsflags; newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; newsk->sk_destruct = sk->sk_destruct; newsk->sk_family = sk->sk_family; newsk->sk_protocol = IPPROTO_SCTP; newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; newsk->sk_sndbuf = sk->sk_sndbuf; newsk->sk_rcvbuf = sk->sk_rcvbuf; newsk->sk_lingertime = sk->sk_lingertime; newsk->sk_rcvtimeo = sk->sk_rcvtimeo; newsk->sk_sndtimeo = sk->sk_sndtimeo; newsk->sk_rxhash = sk->sk_rxhash; newinet = inet_sk(newsk); /* Initialize sk's sport, dport, rcv_saddr and daddr for * getsockname() and getpeername() */ newinet->inet_sport = inet->inet_sport; newinet->inet_saddr = inet->inet_saddr; newinet->inet_rcv_saddr = inet->inet_rcv_saddr; newinet->inet_dport = htons(asoc->peer.port); newinet->pmtudisc = inet->pmtudisc; newinet->inet_id = prandom_u32(); newinet->uc_ttl = inet->uc_ttl; newinet->mc_loop = 1; newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); /* Set newsk security attributes from original sk and connection * security attribute from ep. */ security_sctp_sk_clone(ep, sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, const struct sock *sk_from) { size_t ancestor_size = sizeof(struct inet_sock); ancestor_size += sk_from->sk_prot->obj_size; ancestor_size -= offsetof(struct sctp_sock, pd_lobby); __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); } /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. */ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sctp_association *assoc, enum sctp_socket_type type) { struct sctp_sock *oldsp = sctp_sk(oldsk); struct sctp_sock *newsp = sctp_sk(newsk); struct sctp_bind_bucket *pp; /* hash list port iterator */ struct sctp_endpoint *newep = newsp->ep; struct sk_buff *skb, *tmp; struct sctp_ulpevent *event; struct sctp_bind_hashbucket *head; int err; /* Migrate socket buffer sizes and all the socket level options to the * new socket. */ newsk->sk_sndbuf = oldsk->sk_sndbuf; newsk->sk_rcvbuf = oldsk->sk_rcvbuf; /* Brute force copy old sctp opt. */ sctp_copy_descendant(newsk, oldsk); /* Restore the ep value that was overwritten with the above structure * copy. */ newsp->ep = newep; newsp->hmac = NULL; /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), inet_sk(oldsk)->inet_num)]; spin_lock_bh(&head->lock); pp = sctp_sk(oldsk)->bind_hash; sk_add_bind_node(newsk, &pp->owner); sctp_sk(newsk)->bind_hash = pp; inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num; spin_unlock_bh(&head->lock); /* Copy the bind_addr list from the original endpoint to the new * endpoint so that we can handle restarts properly */ err = sctp_bind_addr_dup(&newsp->ep->base.bind_addr, &oldsp->ep->base.bind_addr, GFP_KERNEL); if (err) return err; /* New ep's auth_hmacs should be set if old ep's is set, in case * that net->sctp.auth_enable has been changed to 0 by users and * new ep's auth_hmacs couldn't be set in sctp_endpoint_init(). */ if (oldsp->ep->auth_hmacs) { err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL); if (err) return err; } sctp_auto_asconf_init(newsp); /* Move any messages in the old socket's receive queue that are for the * peeled off association to the new socket's receive queue. */ sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); sctp_skb_set_owner_r_frag(skb, newsk); } } /* Clean up any messages pending delivery due to partial * delivery. Three cases: * 1) No partial deliver; no work. * 2) Peeling off partial delivery; keep pd_lobby in new pd_lobby. * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue. */ atomic_set(&sctp_sk(newsk)->pd_mode, assoc->ulpq.pd_mode); if (atomic_read(&sctp_sk(oldsk)->pd_mode)) { struct sk_buff_head *queue; /* Decide which queue to move pd_lobby skbs to. */ if (assoc->ulpq.pd_mode) { queue = &newsp->pd_lobby; } else queue = &newsk->sk_receive_queue; /* Walk through the pd_lobby, looking for skbs that * need moved to the new socket. */ sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); sctp_skb_set_owner_r_frag(skb, newsk); } } /* Clear up any skbs waiting for the partial * delivery to finish. */ if (assoc->ulpq.pd_mode) sctp_clear_pd(oldsk, NULL); } sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag); /* Set the type of socket to indicate that it is peeled off from the * original UDP-style socket or created with the accept() call on a * TCP-style socket.. */ newsp->type = type; /* Mark the new socket "in-use" by the user so that any packets * that may arrive on the association after we've moved it are * queued to the backlog. This prevents a potential race between * backlog processing on the old socket and new-packet processing * on the new socket. * * The caller has just allocated newsk so we can guarantee that other * paths won't try to lock it and then oldsk. */ lock_sock_nested(newsk, SINGLE_DEPTH_NESTING); sctp_for_each_tx_datachunk(assoc, true, sctp_clear_owner_w); sctp_assoc_migrate(assoc, newsk); sctp_for_each_tx_datachunk(assoc, false, sctp_set_owner_w); /* If the association on the newsk is already closed before accept() * is called, set RCV_SHUTDOWN flag. */ if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) { inet_sk_set_state(newsk, SCTP_SS_CLOSED); newsk->sk_shutdown |= RCV_SHUTDOWN; } else { inet_sk_set_state(newsk, SCTP_SS_ESTABLISHED); } release_sock(newsk); return 0; } /* This proto struct describes the ULP interface for SCTP. */ struct proto sctp_prot = { .name = "SCTP", .owner = THIS_MODULE, .close = sctp_close, .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, .init = sctp_init_sock, .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, .bpf_bypass_getsockopt = sctp_bpf_bypass_getsockopt, .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp_sock), .useroffset = offsetof(struct sctp_sock, subscribe), .usersize = offsetof(struct sctp_sock, initmsg) - offsetof(struct sctp_sock, subscribe) + sizeof_field(struct sctp_sock, initmsg), .sysctl_mem = sysctl_sctp_mem, .sysctl_rmem = sysctl_sctp_rmem, .sysctl_wmem = sysctl_sctp_wmem, .memory_pressure = &sctp_memory_pressure, .enter_memory_pressure = sctp_enter_memory_pressure, .memory_allocated = &sctp_memory_allocated, .sockets_allocated = &sctp_sockets_allocated, }; #if IS_ENABLED(CONFIG_IPV6) static void sctp_v6_destruct_sock(struct sock *sk) { sctp_destruct_common(sk); inet6_sock_destruct(sk); } static int sctp_v6_init_sock(struct sock *sk) { int ret = sctp_init_sock(sk); if (!ret) sk->sk_destruct = sctp_v6_destruct_sock; return ret; } struct proto sctpv6_prot = { .name = "SCTPv6", .owner = THIS_MODULE, .close = sctp_close, .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, .init = sctp_v6_init_sock, .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, .bpf_bypass_getsockopt = sctp_bpf_bypass_getsockopt, .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - offsetof(struct sctp6_sock, sctp.subscribe) + sizeof_field(struct sctp6_sock, sctp.initmsg), .sysctl_mem = sysctl_sctp_mem, .sysctl_rmem = sysctl_sctp_rmem, .sysctl_wmem = sysctl_sctp_wmem, .memory_pressure = &sctp_memory_pressure, .enter_memory_pressure = sctp_enter_memory_pressure, .memory_allocated = &sctp_memory_allocated, .sockets_allocated = &sctp_sockets_allocated, }; #endif /* IS_ENABLED(CONFIG_IPV6) */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 // SPDX-License-Identifier: GPL-2.0 /* Watch queue and general notification mechanism, built on pipes * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/watch_queue.rst */ #define pr_fmt(fmt) "watchq: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/miscdevice.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/poll.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/sched/signal.h> #include <linux/watch_queue.h> #include <linux/pipe_fs_i.h> MODULE_DESCRIPTION("Watch queue"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) /* * This must be called under the RCU read-lock, which makes * sure that the wqueue still exists. It can then take the lock, * and check that the wqueue hasn't been destroyed, which in * turn makes sure that the notification pipe still exists. */ static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); if (unlikely(wqueue->defunct)) { spin_unlock_bh(&wqueue->lock); return false; } return true; } static inline void unlock_wqueue(struct watch_queue *wqueue) { spin_unlock_bh(&wqueue->lock); } static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct watch_queue *wqueue = (struct watch_queue *)buf->private; struct page *page; unsigned int bit; /* We need to work out which note within the page this refers to, but * the note might have been maximum size, so merely ANDing the offset * off doesn't work. OTOH, the note must've been more than zero size. */ bit = buf->offset + buf->len; if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) bit -= WATCH_QUEUE_NOTE_SIZE; bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; bit += page->index; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing #define watch_queue_pipe_buf_try_steal NULL /* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { .release = watch_queue_pipe_buf_release, .try_steal = watch_queue_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* * Post a notification to a watch queue. * * Must be called with the RCU lock for reading, and the * watch_queue lock held, which guarantees that the pipe * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) { void *p; struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; unsigned int head, tail, mask, note, offset, len; bool done = false; if (!pipe) return false; spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) goto lost; note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); if (note >= wqueue->nr_notes) goto lost; page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; p = kmap_atomic(page); memcpy(p + offset, n, len); kunmap_atomic(p); buf = &pipe->bufs[head & mask]; buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); BUG(); } wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); done = true; out: spin_unlock_irq(&pipe->rd_wait.lock); if (done) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); return done; lost: buf = &pipe->bufs[(head - 1) & mask]; buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } /* * Apply filter rules to a notification. */ static bool filter_watch_notification(const struct watch_filter *wf, const struct watch_notification *n) { const struct watch_type_filter *wt; unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; unsigned int st_index = n->subtype / st_bits; unsigned int st_bit = 1U << (n->subtype % st_bits); int i; if (!test_bit(n->type, wf->type_filter)) return false; for (i = 0; i < wf->nr_filters; i++) { wt = &wf->filters[i]; if (n->type == wt->type && (wt->subtype_filter[st_index] & st_bit) && (n->info & wt->info_mask) == wt->info_filter) return true; } return false; /* If there is a filter, the default is to reject. */ } /** * __post_watch_notification - Post an event notification * @wlist: The watch list to post the event to. * @n: The notification record to post. * @cred: The creds of the process that triggered the notification. * @id: The ID to match on the watch. * * Post a notification of an event into a set of watch queues and let the users * know. * * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and * should be in units of sizeof(*n). */ void __post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { const struct watch_filter *wf; struct watch_queue *wqueue; struct watch *watch; if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { WARN_ON(1); return; } rcu_read_lock(); hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { if (watch->id != id) continue; n->info &= ~WATCH_INFO_ID; n->info |= watch->info_id; wqueue = rcu_dereference(watch->queue); wf = rcu_dereference(wqueue->filter); if (wf && !filter_watch_notification(wf, n)) continue; if (security_post_notification(watch->cred, cred, n) < 0) continue; if (lock_wqueue(wqueue)) { post_one_notification(wqueue, n); unlock_wqueue(wqueue); } } rcu_read_unlock(); } EXPORT_SYMBOL(__post_watch_notification); /* * Allocate sufficient pages to preallocation for the requested number of * notifications. */ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) { struct watch_queue *wqueue = pipe->watch_queue; struct page **pages; unsigned long *bitmap; unsigned long user_bufs; unsigned int bmsize; int ret, i, nr_pages; if (!wqueue) return -ENODEV; if (wqueue->notes) return -EBUSY; if (nr_notes < 1 || nr_notes > 512) /* TODO: choose a better hard limit */ return -EINVAL; nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); if (nr_pages > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto error; } nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; ret = -ENOMEM; pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); if (!pages) goto error; for (i = 0; i < nr_pages; i++) { pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; } bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; bmsize *= sizeof(unsigned long); bitmap = kmalloc(bmsize, GFP_KERNEL); if (!bitmap) goto error_p; memset(bitmap, 0xff, bmsize); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; wqueue->nr_notes = nr_notes; return 0; error_p: while (--i >= 0) __free_page(pages[i]); kfree(pages); error: (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); return ret; } /* * Set the filter on a watch queue. */ long watch_queue_set_filter(struct pipe_inode_info *pipe, struct watch_notification_filter __user *_filter) { struct watch_notification_type_filter *tf; struct watch_notification_filter filter; struct watch_type_filter *q; struct watch_filter *wfilter; struct watch_queue *wqueue = pipe->watch_queue; int ret, nr_filter = 0, i; if (!wqueue) return -ENODEV; if (!_filter) { /* Remove the old filter */ wfilter = NULL; goto set; } /* Grab the user's filter specification */ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) return -EFAULT; if (filter.nr_filters == 0 || filter.nr_filters > 16 || filter.__reserved != 0) return -EINVAL; tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf)); if (IS_ERR(tf)) return PTR_ERR(tf); ret = -EINVAL; for (i = 0; i < filter.nr_filters; i++) { if ((tf[i].info_filter & ~tf[i].info_mask) || tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } /* Now we need to build the internal filter from only the relevant * user-specified filters. */ ret = -ENOMEM; wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); if (!wfilter) goto err_filter; wfilter->nr_filters = nr_filter; q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; q->info_filter = tf[i].info_filter; q->info_mask = tf[i].info_mask; q->subtype_filter[0] = tf[i].subtype_filter[0]; __set_bit(q->type, wfilter->type_filter); q++; } kfree(tf); set: pipe_lock(pipe); wfilter = rcu_replace_pointer(wqueue->filter, wfilter, lockdep_is_held(&pipe->mutex)); pipe_unlock(pipe); if (wfilter) kfree_rcu(wfilter, rcu); return 0; err_filter: kfree(tf); return ret; } static void __put_watch_queue(struct kref *kref) { struct watch_queue *wqueue = container_of(kref, struct watch_queue, usage); struct watch_filter *wfilter; int i; for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); kfree(wqueue->notes); bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) kfree_rcu(wfilter, rcu); kfree_rcu(wqueue, rcu); } /** * put_watch_queue - Dispose of a ref on a watchqueue. * @wqueue: The watch queue to unref. */ void put_watch_queue(struct watch_queue *wqueue) { kref_put(&wqueue->usage, __put_watch_queue); } EXPORT_SYMBOL(put_watch_queue); static void free_watch(struct rcu_head *rcu) { struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); kfree(watch); } static void __put_watch(struct kref *kref) { struct watch *watch = container_of(kref, struct watch, usage); call_rcu(&watch->rcu, free_watch); } /* * Discard a watch. */ static void put_watch(struct watch *watch) { kref_put(&watch->usage, __put_watch); } /** * init_watch - Initialise a watch * @watch: The watch to initialise. * @wqueue: The queue to assign. * * Initialise a watch and set the watch queue. */ void init_watch(struct watch *watch, struct watch_queue *wqueue) { kref_init(&watch->usage); INIT_HLIST_NODE(&watch->list_node); INIT_HLIST_NODE(&watch->queue_node); rcu_assign_pointer(watch->queue, wqueue); } static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue) { const struct cred *cred; struct watch *w; hlist_for_each_entry(w, &wlist->watchers, list_node) { struct watch_queue *wq = rcu_access_pointer(w->queue); if (wqueue == wq && watch->id == w->id) return -EBUSY; } cred = current_cred(); if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) { atomic_dec(&cred->user->nr_watches); return -EAGAIN; } watch->cred = get_cred(cred); rcu_assign_pointer(watch->watch_list, wlist); kref_get(&wqueue->usage); kref_get(&watch->usage); hlist_add_head(&watch->queue_node, &wqueue->watches); hlist_add_head_rcu(&watch->list_node, &wlist->watchers); return 0; } /** * add_watch_to_object - Add a watch on an object to a watch list * @watch: The watch to add * @wlist: The watch list to add to * * @watch->queue must have been set to point to the queue to post notifications * to and the watch list of the object to be watched. @watch->cred must also * have been set to the appropriate credentials and a ref taken on them. * * The caller must pin the queue and the list both and must hold the list * locked against racing watch additions/removals. */ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) { struct watch_queue *wqueue; int ret = -ENOENT; rcu_read_lock(); wqueue = rcu_access_pointer(watch->queue); if (lock_wqueue(wqueue)) { spin_lock(&wlist->lock); ret = add_one_watch(watch, wlist, wqueue); spin_unlock(&wlist->lock); unlock_wqueue(wqueue); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(add_watch_to_object); /** * remove_watch_from_object - Remove a watch or all watches from an object. * @wlist: The watch list to remove from * @wq: The watch queue of interest (ignored if @all is true) * @id: The ID of the watch to remove (ignored if @all is true) * @all: True to remove all objects * * Remove a specific watch or all watches from an object. A notification is * sent to the watcher to tell them that this happened. */ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, u64 id, bool all) { struct watch_notification_removal n; struct watch_queue *wqueue; struct watch *watch; int ret = -EBADSLT; rcu_read_lock(); again: spin_lock(&wlist->lock); hlist_for_each_entry(watch, &wlist->watchers, list_node) { if (all || (watch->id == id && rcu_access_pointer(watch->queue) == wq)) goto found; } spin_unlock(&wlist->lock); goto out; found: ret = 0; hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); spin_unlock(&wlist->lock); /* We now own the reference on watch that used to belong to wlist. */ n.watch.type = WATCH_TYPE_META; n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; n.watch.info = watch->info_id | watch_sizeof(n.watch); n.id = id; if (id != 0) n.watch.info = watch->info_id | watch_sizeof(n); wqueue = rcu_dereference(watch->queue); if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } unlock_wqueue(wqueue); } if (wlist->release_watch) { void (*release_watch)(struct watch *); release_watch = wlist->release_watch; rcu_read_unlock(); (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); if (all && !hlist_empty(&wlist->watchers)) goto again; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(remove_watch_from_object); /* * Remove all the watches that are contributory to a queue. This has the * potential to race with removal of the watches by the destruction of the * objects being watched or with the distribution of notifications. */ void watch_queue_clear(struct watch_queue *wqueue) { struct watch_list *wlist; struct watch *watch; bool release; rcu_read_lock(); spin_lock_bh(&wqueue->lock); /* Prevent new notifications from being stored. */ wqueue->defunct = true; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); hlist_del_init_rcu(&watch->queue_node); /* We now own a ref on the watch. */ spin_unlock_bh(&wqueue->lock); /* We can't do the next bit under the queue lock as we need to * get the list lock - which would cause a deadlock if someone * was removing from the opposite direction at the same time or * posting a notification. */ wlist = rcu_dereference(watch->watch_list); if (wlist) { void (*release_watch)(struct watch *); spin_lock(&wlist->lock); release = !hlist_unhashed(&watch->list_node); if (release) { hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); /* We now own a second ref on the watch. */ } release_watch = wlist->release_watch; spin_unlock(&wlist->lock); if (release) { if (release_watch) { rcu_read_unlock(); /* This might need to call dput(), so * we have to drop all the locks. */ (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); } } put_watch(watch); spin_lock_bh(&wqueue->lock); } spin_unlock_bh(&wqueue->lock); rcu_read_unlock(); } /** * get_watch_queue - Get a watch queue from its file descriptor. * @fd: The fd to query. */ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); struct fd f; f = fdget(fd); if (f.file) { pipe = get_pipe_info(f.file, false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } fdput(f); } return wqueue; } EXPORT_SYMBOL(get_watch_queue); /* * Initialise a watch queue */ int watch_queue_init(struct pipe_inode_info *pipe) { struct watch_queue *wqueue; wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); if (!wqueue) return -ENOMEM; wqueue->pipe = pipe; kref_init(&wqueue->usage); spin_lock_init(&wqueue->lock); INIT_HLIST_HEAD(&wqueue->watches); pipe->watch_queue = wqueue; return 0; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 // SPDX-License-Identifier: GPL-2.0-only /* * Context tracking: Probe on high level context boundaries such as kernel * and userspace. This includes syscalls and exceptions entry/exit. * * This is used by RCU to remove its dependency on the timer tick while a CPU * runs in userspace. * * Started by Frederic Weisbecker: * * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> * * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, * Steven Rostedt, Peter Zijlstra for suggestions and improvements. * */ #include <linux/context_tracking.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/hardirq.h> #include <linux/export.h> #include <linux/kprobes.h> #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> DEFINE_STATIC_KEY_FALSE(context_tracking_key); EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); static noinstr bool context_tracking_recursion_enter(void) { int recursion; recursion = __this_cpu_inc_return(context_tracking.recursion); if (recursion == 1) return true; WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion); __this_cpu_dec(context_tracking.recursion); return false; } static __always_inline void context_tracking_recursion_exit(void) { __this_cpu_dec(context_tracking.recursion); } /** * context_tracking_enter - Inform the context tracking that the CPU is going * enter user or guest space mode. * * This function must be called right before we switch from the kernel * to user or guest space, when it's guaranteed the remaining kernel * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ void noinstr __context_tracking_enter(enum ctx_state state) { /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); if (!context_tracking_recursion_enter()) return; if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be * any RCU read-side critical section until the next call to * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency * on the tick. */ if (state == CONTEXT_USER) { instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); instrumentation_end(); } rcu_user_enter(); } /* * Even if context tracking is disabled on this CPU, because it's outside * the full dynticks mask for example, we still have to keep track of the * context transitions and states to prevent inconsistency on those of * other CPUs. * If a task triggers an exception in userspace, sleep on the exception * handler and then migrate to another CPU, that new CPU must know where * the exception returns by the time we call exception_exit(). * This information can only be provided by the previous CPU when it called * exception_enter(). * OTOH we can spare the calls to vtime and RCU when context_tracking.active * is false because we know that CPU is not tickless. */ __this_cpu_write(context_tracking.state, state); } context_tracking_recursion_exit(); } EXPORT_SYMBOL_GPL(__context_tracking_enter); void context_tracking_enter(enum ctx_state state) { unsigned long flags; /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() * This would mess up the dyntick_nesting count though. And rcu_irq_*() * helpers are enough to protect RCU uses inside the exception. So * just return immediately if we detect we are in an IRQ. */ if (in_interrupt()) return; local_irq_save(flags); __context_tracking_enter(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); EXPORT_SYMBOL_GPL(context_tracking_enter); void context_tracking_user_enter(void) { user_enter(); } NOKPROBE_SYMBOL(context_tracking_user_enter); /** * context_tracking_exit - Inform the context tracking that the CPU is * exiting user or guest mode and entering the kernel. * * This function must be called after we entered the kernel from user or * guest space before any use of RCU read side critical section. This * potentially include any high level kernel code like syscalls, exceptions, * signal handling, etc... * * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ void noinstr __context_tracking_exit(enum ctx_state state) { if (!context_tracking_recursion_enter()) return; if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. Inform * RCU core about that (ie: we may need the tick again). */ rcu_user_exit(); if (state == CONTEXT_USER) { instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); instrumentation_end(); } } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } EXPORT_SYMBOL_GPL(__context_tracking_exit); void context_tracking_exit(enum ctx_state state) { unsigned long flags; if (in_interrupt()) return; local_irq_save(flags); __context_tracking_exit(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); EXPORT_SYMBOL_GPL(context_tracking_exit); void context_tracking_user_exit(void) { user_exit(); } NOKPROBE_SYMBOL(context_tracking_user_exit); void __init context_tracking_cpu_set(int cpu) { static __initdata bool initialized = false; if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; static_branch_inc(&context_tracking_key); } if (initialized) return; #ifdef CONFIG_HAVE_TIF_NOHZ /* * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork * This assumes that init is the only task at this early boot stage. */ set_tsk_thread_flag(&init_task, TIF_NOHZ); #endif WARN_ON_ONCE(!tasklist_empty()); initialized = true; } #ifdef CONFIG_CONTEXT_TRACKING_FORCE void __init context_tracking_init(void) { int cpu; for_each_possible_cpu(cpu) context_tracking_cpu_set(cpu); } #endif
801 27 26 1109 855 255 826 1003 1007 699 699 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/lockref.h> #if USE_CMPXCHG_LOCKREF /* * Note that the "cmpxchg()" reloads the "old" value for the * failure case. */ #define CMPXCHG_LOOP(CODE, SUCCESS) do { \ int retry = 100; \ struct lockref old; \ BUILD_BUG_ON(sizeof(old) != 8); \ old.lock_count = READ_ONCE(lockref->lock_count); \ while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \ struct lockref new = old, prev = old; \ CODE \ old.lock_count = cmpxchg64_relaxed(&lockref->lock_count, \ old.lock_count, \ new.lock_count); \ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ if (!--retry) \ break; \ } \ } while (0) #else #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0) #endif /** * lockref_get - Increments reference count unconditionally * @lockref: pointer to lockref structure * * This operation is only valid if you already hold a reference * to the object, so you know the count cannot be zero. */ void lockref_get(struct lockref *lockref) { CMPXCHG_LOOP( new.count++; , return; ); spin_lock(&lockref->lock); lockref->count++; spin_unlock(&lockref->lock); } EXPORT_SYMBOL(lockref_get); /** * lockref_get_not_zero - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero */ int lockref_get_not_zero(struct lockref *lockref) { int retval; CMPXCHG_LOOP( new.count++; if (old.count <= 0) return 0; , return 1; ); spin_lock(&lockref->lock); retval = 0; if (lockref->count > 0) { lockref->count++; retval = 1; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_get_not_zero); /** * lockref_put_not_zero - Decrements count unless count <= 1 before decrement * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count would become zero */ int lockref_put_not_zero(struct lockref *lockref) { int retval; CMPXCHG_LOOP( new.count--; if (old.count <= 1) return 0; , return 1; ); spin_lock(&lockref->lock); retval = 0; if (lockref->count > 1) { lockref->count--; retval = 1; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_put_not_zero); /** * lockref_get_or_lock - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero * and we got the lock instead. */ int lockref_get_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count++; if (old.count <= 0) break; , return 1; ); spin_lock(&lockref->lock); if (lockref->count <= 0) return 0; lockref->count++; spin_unlock(&lockref->lock); return 1; } EXPORT_SYMBOL(lockref_get_or_lock); /** * lockref_put_return - Decrement reference count if possible * @lockref: pointer to lockref structure * * Decrement the reference count and return the new value. * If the lockref was dead or locked, return an error. */ int lockref_put_return(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 0) return -1; , return new.count; ); return -1; } EXPORT_SYMBOL(lockref_put_return); /** * lockref_put_or_lock - decrements count unless count <= 1 before decrement * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken */ int lockref_put_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 1) break; , return 1; ); spin_lock(&lockref->lock); if (lockref->count <= 1) return 0; lockref->count--; spin_unlock(&lockref->lock); return 1; } EXPORT_SYMBOL(lockref_put_or_lock); /** * lockref_mark_dead - mark lockref dead * @lockref: pointer to lockref structure */ void lockref_mark_dead(struct lockref *lockref) { assert_spin_locked(&lockref->lock); lockref->count = -128; } EXPORT_SYMBOL(lockref_mark_dead); /** * lockref_get_not_dead - Increments count unless the ref is dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if lockref was dead */ int lockref_get_not_dead(struct lockref *lockref) { int retval; CMPXCHG_LOOP( new.count++; if (old.count < 0) return 0; , return 1; ); spin_lock(&lockref->lock); retval = 0; if (lockref->count >= 0) { lockref->count++; retval = 1; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_get_not_dead);
115 117 117 116 117 117 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kzalloc(3 * sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 // SPDX-License-Identifier: GPL-2.0-only /* * cfg80211 debugfs * * Copyright 2009 Luis R. Rodriguez <lrodriguez@atheros.com> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> */ #include <linux/slab.h> #include "core.h" #include "debugfs.h" #define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...) \ static ssize_t name## _read(struct file *file, char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct wiphy *wiphy = file->private_data; \ char buf[buflen]; \ int res; \ \ res = scnprintf(buf, buflen, fmt "\n", ##value); \ return simple_read_from_buffer(userbuf, count, ppos, buf, res); \ } \ \ static const struct file_operations name## _ops = { \ .read = name## _read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", wiphy->rts_threshold); DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", wiphy->frag_threshold); DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", wiphy->retry_short); DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", wiphy->retry_long); static int ht_print_chan(struct ieee80211_channel *chan, char *buf, int buf_size, int offset) { if (WARN_ON(offset > buf_size)) return 0; if (chan->flags & IEEE80211_CHAN_DISABLED) return scnprintf(buf + offset, buf_size - offset, "%d Disabled\n", chan->center_freq); return scnprintf(buf + offset, buf_size - offset, "%d HT40 %c%c\n", chan->center_freq, (chan->flags & IEEE80211_CHAN_NO_HT40MINUS) ? ' ' : '-', (chan->flags & IEEE80211_CHAN_NO_HT40PLUS) ? ' ' : '+'); } static ssize_t ht40allow_map_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct wiphy *wiphy = file->private_data; char *buf; unsigned int offset = 0, buf_size = PAGE_SIZE, i; enum nl80211_band band; struct ieee80211_supported_band *sband; ssize_t r; buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) return -ENOMEM; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) offset += ht_print_chan(&sband->channels[i], buf, buf_size, offset); } r = simple_read_from_buffer(user_buf, count, ppos, buf, offset); kfree(buf); return r; } static const struct file_operations ht40allow_map_ops = { .read = ht40allow_map_read, .open = simple_open, .llseek = default_llseek, }; #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops) void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) { struct dentry *phyd = rdev->wiphy.debugfsdir; DEBUGFS_ADD(rts_threshold); DEBUGFS_ADD(fragmentation_threshold); DEBUGFS_ADD(short_retry_limit); DEBUGFS_ADD(long_retry_limit); DEBUGFS_ADD(ht40allow_map); }
53 54 32 1 40 28 2 26 63 63 59 59 33 44 44 38 38 78 53 74 54 18 1 12 22 45 14 16 45 45 35 32 2 32 32 32 32 31 37 37 13 62 62 25 62 28 28 1 28 3 26 37 18 12 5 28 31 29 3 23 68 28 93 72 52 2 78 62 33 8 63 78 44 59 70 26 31 32 62 6 14 7 11 11 1 1 63 63 80 79 43 38 57 26 80 10 76 76 45 29 29 45 45 3 32 31 32 1 57 1 14 26 65 8 61 21 17 1 5 5 8 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 /* * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "addr.h" #include "group.h" #include "bcast.h" #include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" #include "name_table.h" #include "subscr.h" #define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) #define ADV_IDLE ADV_UNIT #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, MBR_PENDING, MBR_ACTIVE, MBR_RECLAIMING, MBR_REMITTED, MBR_LEAVING }; struct tipc_member { struct rb_node tree_node; struct list_head list; struct list_head small_win; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; u32 port; u32 instance; enum mbr_state state; u16 advertised; u16 window; u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; }; struct tipc_group { struct rb_root members; struct list_head small_win; struct list_head pending; struct list_head active; struct tipc_nlist dests; struct net *net; int subid; u32 type; u32 instance; u32 scope; u32 portid; u16 member_cnt; u16 active_cnt; u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; bool *open; bool loopback; bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); static void tipc_group_open(struct tipc_member *m, bool *wakeup) { *wakeup = false; if (list_empty(&m->small_win)) return; list_del_init(&m->small_win); *m->group->open = true; *wakeup = true; } static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || m->state == MBR_REMITTED) grp->active_cnt--; } static int tipc_group_rcvbuf_limit(struct tipc_group *grp) { int max_active, active_pool, idle_pool; int mcnt = grp->member_cnt + 1; /* Limit simultaneous reception from other members */ max_active = min(mcnt / 8, 64); max_active = max(max_active, 16); grp->max_active = max_active; /* Reserve blocks for active and idle members */ active_pool = max_active * ADV_ACTIVE; idle_pool = (mcnt - max_active) * ADV_IDLE; /* Scale to bytes, considering worst-case truesize/msgsize ratio */ return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; } u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) { return grp->bc_snd_nxt; } static bool tipc_group_is_receiver(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } static bool tipc_group_is_sender(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) { if (!grp->loopback) return grp->portid; return 0; } struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) { u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; grp = kzalloc(sizeof(*grp), GFP_ATOMIC); if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; grp->open = group_is_open; *grp->open = false; filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, filter, &grp->subid)) return grp; kfree(grp); return NULL; } void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); tipc_group_update_member(m, 0); } tipc_node_distr_xmit(net, &xmitq); *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); __skb_queue_purge(&m->deferredq); list_del(&m->list); kfree(m); } tipc_node_distr_xmit(net, &xmitq); tipc_nlist_purge(&grp->dests); tipc_topsrv_kern_unsubscr(net, grp->subid); kfree(grp); } static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; struct tipc_member *m; while (n) { m = container_of(n, struct tipc_member, tree_node); nkey = (u64)m->node << 32 | m->port; if (key < nkey) n = n->rb_left; else if (key > nkey) n = n->rb_right; else return m; } return NULL; } static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, u32 node, u32 port) { struct tipc_member *m; m = tipc_group_find_member(grp, node, port); if (m && tipc_group_is_receiver(m)) return m; return NULL; } static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, u32 node) { struct tipc_member *m; struct rb_node *n; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (m->node == node) return m; } return NULL; } static int tipc_group_add_to_tree(struct tipc_group *grp, struct tipc_member *m) { u64 nkey, key = (u64)m->node << 32 | m->port; struct rb_node **n, *parent = NULL; struct tipc_member *tmp; n = &grp->members.rb_node; while (*n) { tmp = container_of(*n, struct tipc_member, tree_node); parent = *n; tmp = container_of(parent, struct tipc_member, tree_node); nkey = (u64)tmp->node << 32 | tmp->port; if (key < nkey) n = &(*n)->rb_left; else if (key > nkey) n = &(*n)->rb_right; else return -EEXIST; } rb_link_node(&m->tree_node, parent, n); rb_insert_color(&m->tree_node, &grp->members); return 0; } static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, u32 instance, int state) { struct tipc_member *m; int ret; m = kzalloc(sizeof(*m), GFP_ATOMIC); if (!m) return NULL; INIT_LIST_HEAD(&m->list); INIT_LIST_HEAD(&m->small_win); __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; ret = tipc_group_add_to_tree(grp, m); if (ret < 0) { kfree(m); return NULL; } grp->member_cnt++; tipc_nlist_add(&grp->dests, m->node); m->state = state; return m; } void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance) { tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, struct tipc_member *m) { rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; /* Check if we were waiting for replicast ack from this member */ if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) grp->bc_ackers--; list_del_init(&m->list); list_del_init(&m->small_win); tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) tipc_nlist_del(&grp->dests, m->node); kfree(m); } struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) { return &grp->dests; } void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; seq->lower = grp->instance; seq->upper = grp->instance; *scope = grp->scope; } void tipc_group_update_member(struct tipc_member *m, int len) { struct tipc_group *grp = m->group; struct tipc_member *_m, *tmp; if (!tipc_group_is_receiver(m)) return; m->window -= len; if (m->window >= ADV_IDLE) return; list_del_init(&m->small_win); /* Sort member into small_window members' list */ list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) { if (_m->window > m->window) break; } list_add_tail(&m->small_win, &_m->small_win); } void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) { u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_receiver(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) grp->bc_ackers = ackers; grp->bc_snd_nxt++; } bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **mbr) { struct sk_buff_head xmitq; struct tipc_member *m; int adv, state; m = tipc_group_find_dest(grp, dnode, dport); if (!tipc_group_is_receiver(m)) { *mbr = NULL; return false; } *mbr = m; if (m->window >= len) return false; *grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) return true; if (state == MBR_PENDING && adv == ADV_IDLE) return true; __skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); return true; } bool tipc_group_bc_cong(struct tipc_group *grp, int len) { struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ if (grp->bc_ackers) { *grp->open = false; return true; } if (list_empty(&grp->small_win)) return false; m = list_first_entry(&grp->small_win, struct tipc_member, small_win); if (m->window >= len) return false; return tipc_group_cong(grp, m->node, m->port, len, &m); } /* tipc_group_sort_msg() - sort msg into queue by bcast sequence number */ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) { struct tipc_msg *_hdr, *hdr = buf_msg(skb); u16 bc_seqno = msg_grp_bc_seqno(hdr); struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) continue; __skb_queue_before(defq, _skb, skb); return; } /* Bcast was not bypassed, - add to tail */ } /* Unicasts are never bypassed, - always add to tail */ __skb_queue_tail(defq, skb); } /* tipc_group_filter_msg() - determine if we should accept arriving message */ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); bool ack, deliver, update, leave = false; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; u32 node, port; int mtyp, blks; if (!skb) return; hdr = buf_msg(skb); node = msg_orignode(hdr); port = msg_origport(hdr); if (!msg_in_group(hdr)) goto drop; m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_sender(m)) goto drop; if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) goto drop; TIPC_SKB_CB(skb)->orig_member = m->instance; defq = &m->deferredq; tipc_group_sort_msg(skb, defq); while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); blks = msg_blocks(hdr); deliver = true; ack = false; update = false; if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) break; /* Decide what to do with message */ switch (mtyp) { case TIPC_GRP_MCAST_MSG: if (msg_nameinst(hdr) != grp->instance) { update = true; deliver = false; } fallthrough; case TIPC_GRP_BCAST_MSG: m->bc_rcv_nxt++; ack = msg_grp_bc_ack_req(hdr); break; case TIPC_GRP_UCAST_MSG: break; case TIPC_GRP_MEMBER_EVT: if (m->state == MBR_LEAVING) leave = true; if (!grp->events) deliver = false; break; default: break; } /* Execute decisions */ __skb_dequeue(defq); if (deliver) __skb_queue_tail(inputq, skb); else kfree_skb(skb); if (ack) tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { __skb_queue_purge(defq); tipc_group_delete_member(grp, m); break; } if (!update) continue; tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; drop: kfree_skb(skb); } void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq) { struct list_head *active = &grp->active; int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) return; m->advertised -= blks; switch (m->state) { case MBR_JOINED: /* First, decide if member can go active */ if (active_cnt <= max_active) { m->state = MBR_ACTIVE; list_add_tail(&m->list, active); grp->active_cnt++; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } else { m->state = MBR_PENDING; list_add_tail(&m->list, &grp->pending); } if (active_cnt < reclaim_limit) break; /* Reclaim from oldest active member, if possible */ if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); break; } /* Nobody to reclaim from; - revert oldest pending to JOINED */ pm = list_first_entry(&grp->pending, struct tipc_member, list); list_del_init(&pm->list); pm->state = MBR_JOINED; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); if (m->advertised > (ADV_ACTIVE * 3 / 4)) break; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); break; case MBR_REMITTED: if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } if (list_empty(&grp->pending)) return; /* Set oldest pending member to active and advertise */ pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_JOINING: case MBR_LEAVING: default: break; } } static void tipc_group_create_event(struct tipc_group *grp, struct tipc_member *m, u32 event, u16 seqno, struct sk_buff_head *inputq) { u32 dnode = tipc_own_addr(grp->net); struct tipc_event evt; struct sk_buff *skb; struct tipc_msg *hdr; memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; evt.port.ref = m->port; evt.port.node = m->node; evt.s.seq.type = grp->type; evt.s.seq.lower = m->instance; evt.s.seq.upper = m->instance; skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, GROUP_H_SIZE, sizeof(evt), dnode, m->node, grp->portid, m->port, 0); if (!skb) return; hdr = buf_msg(skb); msg_set_nametype(hdr, grp->type); msg_set_grp_evt(hdr, event); msg_set_dest_droppable(hdr, true); msg_set_grp_bc_seqno(hdr, seqno); memcpy(msg_data(hdr), &evt, sizeof(evt)); TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); } static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { struct tipc_msg *hdr; struct sk_buff *skb; int adv = 0; skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, m->node, tipc_own_addr(grp->net), m->port, grp->portid, 0); if (!skb) return; if (m->state == MBR_ACTIVE) adv = ADV_ACTIVE - m->advertised; else if (m->state == MBR_JOINED || m->state == MBR_PENDING) adv = ADV_IDLE - m->advertised; hdr = buf_msg(skb); if (mtyp == GRP_JOIN_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_LEAVE_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_ACK_MSG) { msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; u16 remitted, in_flight; if (!grp) return; if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) return; m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received if necessary */ if (m->state != MBR_PUBLISHED) return; /* Member can be taken into service */ m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); list_del_init(&m->list); tipc_group_open(m, usr_wakeup); tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) return; m->window += msg_adv_win(hdr); tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) return; list_del_init(&m->small_win); *m->group->open = true; *usr_wakeup = true; tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) return; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; m->advertised = ADV_IDLE + in_flight; return; } /* This should never happen */ if (m->advertised < remitted) pr_warn_ratelimited("Unexpected REMIT msg\n"); /* All messages preceding the REMIT have been read */ m->state = MBR_JOINED; grp->active_cnt--; m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) return; pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } } /* tipc_group_member_evt() - receive and handle a member up/down event */ void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; int event = evt->event; struct tipc_member *m; struct net *net; u32 self; if (!grp) return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) return; m = tipc_group_find_member(grp, node, port); switch (event) { case TIPC_PUBLISHED: /* Send and wait for arrival of JOIN message if necessary */ if (!m) { m = tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); if (!m) break; tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); break; } if (m->state != MBR_JOINING) break; /* Member can be taken into service */ m->instance = instance; m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); break; case TIPC_WITHDRAWN: if (!m) break; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; list_del_init(&m->list); tipc_group_open(m, usr_wakeup); /* Only send event if no LEAVE message can be expected */ if (!tipc_node_is_up(net, node)) tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_rcv_nxt, inputq); break; default: break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) { struct nlattr *group = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_GROUP); if (!group) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, grp->type) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, grp->instance) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, grp->bc_snd_nxt)) goto group_msg_cancel; if (grp->scope == TIPC_NODE_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) goto group_msg_cancel; if (grp->scope == TIPC_CLUSTER_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) goto group_msg_cancel; if (*grp->open) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) goto group_msg_cancel; nla_nest_end(skb, group); return 0; group_msg_cancel: nla_nest_cancel(skb, group); return -1; }
235 236 236 3087 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MMU_CONTEXT_H #define _ASM_X86_MMU_CONTEXT_H #include <asm/desc.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/pkeys.h> #include <trace/events/tlb.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/debugreg.h> extern atomic64_t last_mm_ctx_id; #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { } #endif /* !CONFIG_PARAVIRT_XXL */ #ifdef CONFIG_PERF_EVENTS DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); void cr4_update_pce(void *ignored); #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL /* * ldt_structs can be allocated, used, and freed, but they are never * modified while live. */ struct ldt_struct { /* * Xen requires page-aligned LDTs with special permissions. This is * needed to prevent us from installing evil descriptors such as * call gates. On native, we could merge the ldt_struct and LDT * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; unsigned int nr_entries; /* * If PTI is in use, then the entries array is not mapped while we're * in user mode. The whole array will be aliased at the addressed * given by ldt_slot_va(slot). We use two slots so that we can allocate * and map, and enable a new LDT without invalidating the mapping * of an older, still-in-use LDT. * * slot will be -1 if this LDT doesn't have an alias mapping. */ int slot; }; /* * Used for LDT copy/destruction. */ static inline void init_new_context_ldt(struct mm_struct *mm) { mm->context.ldt = NULL; init_rwsem(&mm->context.ldt_usr_sem); } int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); void ldt_arch_exit_mmap(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ static inline void init_new_context_ldt(struct mm_struct *mm) { } static inline int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm) { return 0; } static inline void destroy_context_ldt(struct mm_struct *mm) { } static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL extern void load_mm_ldt(struct mm_struct *mm); extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next); #else static inline void load_mm_ldt(struct mm_struct *mm) { clear_LDT(); } static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { DEBUG_LOCKS_WARN_ON(preemptible()); } #endif #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). */ #define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { mutex_init(&mm->context.lock); mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); mm->context.next_trim_cpumask = jiffies + HZ; #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and allocated implicitly */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; } #endif init_new_context_ldt(mm); return 0; } #define destroy_context destroy_context static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); #define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ paravirt_activate_mm((prev), (next)); \ switch_mm((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ lazy_load_gs(0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ do { \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) #endif static inline void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* Duplicate the oldmm pkey state in mm: */ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; #endif } static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); paravirt_arch_dup_mmap(oldmm, mm); return ldt_dup_context(oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || !(mm->context.flags & MM_CONTEXT_UPROBE_IA32); } #else static inline bool is_64bit_mm(struct mm_struct *mm) { return false; } #endif static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other * processes or any way to tell *which * PKRU in a threaded * process we could use. * * So do not enforce things if the VMA is not from the current * mm, or if we are in a kernel thread. */ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { /* pkeys never affect instruction fetches */ if (execute) return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; return __pkru_allows_pkey(vma_pkey(vma), write); } unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> #endif /* _ASM_X86_MMU_CONTEXT_H */
41 41 41 41 41 41 41 41 40 34 34 8 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2018-2021 Intel Corporation */ #ifndef IEEE80211_I_H #define IEEE80211_I_H #include <linux/kernel.h> #include <linux/device.h> #include <linux/if_ether.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/workqueue.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/etherdevice.h> #include <linux/leds.h> #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/rbtree.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> #include <net/fq.h> #include "key.h" #include "sta_info.h" #include "debug.h" extern const struct cfg80211_ops mac80211_config_ops; struct ieee80211_local; /* Maximum number of broadcast/multicast frames to buffer when some of the * associated stations are using power saving. */ #define AP_MAX_BC_BUFFER 128 /* Maximum number of frames buffered to all STAs, including multicast frames. * Note: increasing this limit increases the potential memory requirement. Each * frame can be up to about 2 kB long. */ #define TOTAL_MAX_TX_BUFFER 512 /* Required encryption head and tailroom */ #define IEEE80211_ENCRYPT_HEADROOM 8 #define IEEE80211_ENCRYPT_TAILROOM 18 /* power level hasn't been configured (or set to automatic) */ #define IEEE80211_UNSET_POWER_LEVEL INT_MIN /* * Some APs experience problems when working with U-APSD. Decreasing the * probability of that happening by using legacy mode for all ACs but VO isn't * enough. * * Cisco 4410N originally forced us to enable VO by default only because it * treated non-VO ACs as legacy. * * However some APs (notably Netgear R7000) silently reclassify packets to * different ACs. Since u-APSD ACs require trigger frames for frame retrieval * clients would never see some frames (e.g. ARP responses) or would fetch them * accidentally after a long time. * * It makes little sense to enable u-APSD queues by default because it needs * userspace applications to be aware of it to actually take advantage of the * possible additional powersavings. Implicitly depending on driver autotrigger * frame support doesn't make much sense. */ #define IEEE80211_DEFAULT_UAPSD_QUEUES 0 #define IEEE80211_DEFAULT_MAX_SP_LEN \ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) #define IEEE80211_MAX_NAN_INSTANCE_ID 255 struct ieee80211_bss { u32 device_ts_beacon, device_ts_presp; bool wmm_used; bool uapsd_supported; #define IEEE80211_MAX_SUPP_RATES 32 u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; size_t supp_rates_len; struct ieee80211_rate *beacon_rate; u32 vht_cap_info; /* * During association, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the * association completes. these fields probably won't be up-to-date * otherwise, you probably don't want to use them. */ bool has_erp_value; u8 erp_value; /* Keep track of the corruption of the last beacon/probe response. */ u8 corrupt_data; /* Keep track of what bits of information we have valid info for. */ u8 valid_data; }; /** * enum ieee80211_bss_corrupt_data_flags - BSS data corruption flags * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted * * These are bss flags that are attached to a bss in the * @corrupt_data field of &struct ieee80211_bss. */ enum ieee80211_bss_corrupt_data_flags { IEEE80211_BSS_CORRUPT_BEACON = BIT(0), IEEE80211_BSS_CORRUPT_PROBE_RESP = BIT(1) }; /** * enum ieee80211_bss_valid_data_flags - BSS valid data flags * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE * * These are bss flags that are attached to a bss in the * @valid_data field of &struct ieee80211_bss. They show which parts * of the data structure were received as a result of an un-corrupted * beacon/probe response. */ enum ieee80211_bss_valid_data_flags { IEEE80211_BSS_VALID_WMM = BIT(1), IEEE80211_BSS_VALID_RATES = BIT(2), IEEE80211_BSS_VALID_ERP = BIT(3) }; typedef unsigned __bitwise ieee80211_tx_result; #define TX_CONTINUE ((__force ieee80211_tx_result) 0u) #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED ((__force ieee80211_tx_result) 2u) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) struct ieee80211_tx_data { struct sk_buff *skb; struct sk_buff_head skbs; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_key *key; struct ieee80211_tx_rate rate; unsigned int flags; }; typedef unsigned __bitwise ieee80211_rx_result; #define RX_CONTINUE ((__force ieee80211_rx_result) 0u) #define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u) #define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u) #define RX_QUEUED ((__force ieee80211_rx_result) 3u) /** * enum ieee80211_packet_rx_flags - packet RX flags * @IEEE80211_RX_AMSDU: a-MSDU packet * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering * * These are per-frame flags that are attached to a frame in the * @rx_flags field of &struct ieee80211_rx_status. */ enum ieee80211_packet_rx_flags { IEEE80211_RX_AMSDU = BIT(3), IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), IEEE80211_RX_DEFERRED_RELEASE = BIT(5), }; /** * enum ieee80211_rx_flags - RX data flags * * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). * * These flags are used across handling multiple interfaces * for a single frame. */ enum ieee80211_rx_flags { IEEE80211_RX_CMNTR = BIT(0), IEEE80211_RX_BEACON_REPORTED = BIT(1), }; struct ieee80211_rx_data { struct list_head *list; struct sk_buff *skb; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_key *key; unsigned int flags; /* * Index into sequence numbers array, 0..16 * since the last (16) is used for non-QoS, * will be 16 on non-QoS frames. */ int seqno_idx; /* * Index into the security IV/PN arrays, 0..16 * since the last (16) is used for CCMP-encrypted * management frames, will be set to 16 on mgmt * frames and 0 on non-QoS frames. */ int security_idx; union { struct { u32 iv32; u16 iv16; } tkip; struct { u8 pn[IEEE80211_CCMP_PN_LEN]; } ccm_gcm; }; }; struct ieee80211_csa_settings { const u16 *counter_offsets_beacon; const u16 *counter_offsets_presp; int n_counter_offsets_beacon; int n_counter_offsets_presp; u8 count; }; struct ieee80211_color_change_settings { u16 counter_offset_beacon; u16 counter_offset_presp; u8 count; }; struct beacon_data { u8 *head, *tail; int head_len, tail_len; struct ieee80211_meshconf_ie *meshconf; u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u8 cntdwn_current_counter; struct rcu_head rcu_head; }; struct probe_resp { struct rcu_head rcu_head; int len; u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u8 data[]; }; struct fils_discovery_data { struct rcu_head rcu_head; int len; u8 data[]; }; struct unsol_bcast_probe_resp_data { struct rcu_head rcu_head; int len; u8 data[]; }; struct ps_data { /* yes, this looks ugly, but guarantees that we can later use * bitmap_empty :) * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */ u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)] __aligned(__alignof__(unsigned long)); struct sk_buff_head bc_buf; atomic_t num_sta_ps; /* number of stations in PS mode */ int dtim_count; bool dtim_bc_mc; }; struct ieee80211_if_ap { struct beacon_data __rcu *beacon; struct probe_resp __rcu *probe_resp; struct fils_discovery_data __rcu *fils_discovery; struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp; /* to be used after channel switch. */ struct cfg80211_beacon_data *next_beacon; struct list_head vlans; /* write-protected with RTNL and local->mtx */ struct ps_data ps; atomic_t num_mcast_sta; /* number of stations receiving multicast */ bool multicast_to_unicast; }; struct ieee80211_if_vlan { struct list_head list; /* write-protected with RTNL and local->mtx */ /* used for all tx if the VLAN is configured to 4-addr mode */ struct sta_info __rcu *sta; atomic_t num_mcast_sta; /* number of stations receiving multicast */ }; struct mesh_stats { __u32 fwded_mcast; /* Mesh forwarded multicast frames */ __u32 fwded_unicast; /* Mesh forwarded unicast frames */ __u32 fwded_frames; /* Mesh total forwarded frames */ __u32 dropped_frames_ttl; /* Not transmitted since mesh_ttl == 0*/ __u32 dropped_frames_no_route; /* Not transmitted, no route found */ __u32 dropped_frames_congestion;/* Not forwarded due to congestion */ }; #define PREQ_Q_F_START 0x1 #define PREQ_Q_F_REFRESH 0x2 struct mesh_preq_queue { struct list_head list; u8 dst[ETH_ALEN]; u8 flags; }; struct ieee80211_roc_work { struct list_head list; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *chan; bool started, abort, hw_begun, notified; bool on_channel; unsigned long start_time; u32 duration, req_duration; struct sk_buff *frame; u64 cookie, mgmt_tx_cookie; enum ieee80211_roc_type type; }; /* flags used in struct ieee80211_if_managed.flags */ enum ieee80211_sta_flags { IEEE80211_STA_CONNECTION_POLL = BIT(1), IEEE80211_STA_CONTROL_PORT = BIT(2), IEEE80211_STA_DISABLE_HT = BIT(4), IEEE80211_STA_MFP_ENABLED = BIT(6), IEEE80211_STA_UAPSD_ENABLED = BIT(7), IEEE80211_STA_NULLFUNC_ACKED = BIT(8), IEEE80211_STA_RESET_SIGNAL_AVE = BIT(9), IEEE80211_STA_DISABLE_40MHZ = BIT(10), IEEE80211_STA_DISABLE_VHT = BIT(11), IEEE80211_STA_DISABLE_80P80MHZ = BIT(12), IEEE80211_STA_DISABLE_160MHZ = BIT(13), IEEE80211_STA_DISABLE_WMM = BIT(14), IEEE80211_STA_ENABLE_RRM = BIT(15), IEEE80211_STA_DISABLE_HE = BIT(16), }; struct ieee80211_mgd_auth_data { struct cfg80211_bss *bss; unsigned long timeout; int tries; u16 algorithm, expected_transaction; u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; bool done, waiting; bool peer_confirmed; bool timeout_started; u16 sae_trans, sae_status; size_t data_len; u8 data[]; }; struct ieee80211_mgd_assoc_data { struct cfg80211_bss *bss; const u8 *supp_rates; unsigned long timeout; int tries; u16 capability; u8 prev_bssid[ETH_ALEN]; u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len; u8 supp_rates_len; bool wmm, uapsd; bool need_beacon; bool synced; bool timeout_started; u8 ap_ht_param; struct ieee80211_vht_cap ap_vht_cap; u8 fils_nonces[2 * FILS_NONCE_LEN]; u8 fils_kek[FILS_MAX_KEK_LEN]; size_t fils_kek_len; size_t ie_len; u8 ie[]; }; struct ieee80211_sta_tx_tspec { /* timestamp of the first packet in the time slice */ unsigned long time_slice_start; u32 admitted_time; /* in usecs, unlike over the air */ u8 tsid; s8 up; /* signed to be able to invalidate with -1 during teardown */ /* consumed TX time in microseconds in the time slice */ u32 consumed_tx_time; enum { TX_TSPEC_ACTION_NONE = 0, TX_TSPEC_ACTION_DOWNGRADE, TX_TSPEC_ACTION_STOP_DOWNGRADE, } action; bool downgraded; }; DECLARE_EWMA(beacon_signal, 4, 4) struct ieee80211_if_managed { struct timer_list timer; struct timer_list conn_mon_timer; struct timer_list bcn_mon_timer; struct timer_list chswitch_timer; struct work_struct monitor_work; struct work_struct chswitch_work; struct work_struct beacon_connection_loss_work; struct work_struct csa_connection_drop_work; unsigned long beacon_timeout; unsigned long probe_timeout; int probe_send_count; bool nullfunc_failed; u8 connection_loss:1, driver_disconnect:1, reconnect:1; struct cfg80211_bss *associated; struct ieee80211_mgd_auth_data *auth_data; struct ieee80211_mgd_assoc_data *assoc_data; u8 bssid[ETH_ALEN] __aligned(2); bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ bool have_beacon; u8 dtim_period; enum ieee80211_smps_mode req_smps, /* requested smps mode */ driver_smps_mode; /* smps mode request */ struct work_struct request_smps_work; unsigned int flags; bool csa_waiting_bcn; bool csa_ignored_same_chan; bool beacon_crc_valid; u32 beacon_crc; bool status_acked; bool status_received; __le16 status_fc; enum { IEEE80211_MFP_DISABLED, IEEE80211_MFP_OPTIONAL, IEEE80211_MFP_REQUIRED } mfp; /* management frame protection */ /* * Bitmask of enabled u-apsd queues, * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association * to take effect. */ unsigned int uapsd_queues; /* * Maximum number of buffered frames AP can deliver during a * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar. * Needs a new association to take effect. */ unsigned int uapsd_max_sp_len; int wmm_last_param_set; int mu_edca_last_param_set; u8 use_4addr; s16 p2p_noa_index; struct ewma_beacon_signal ave_beacon_signal; /* * Number of Beacon frames used in ave_beacon_signal. This can be used * to avoid generating less reliable cqm events that would be based * only on couple of received frames. */ unsigned int count_beacon_signal; /* Number of times beacon loss was invoked. */ unsigned int beacon_loss_count; /* * Last Beacon frame signal strength average (ave_beacon_signal / 16) * that triggered a cqm event. 0 indicates that no event has been * generated for the current association. */ int last_cqm_event_signal; /* * State variables for keeping track of RSSI of the AP currently * connected to and informing driver when RSSI has gone * below/above a certain threshold. */ int rssi_min_thold, rssi_max_thold; int last_ave_beacon_signal; struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */ struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */ struct ieee80211_s1g_cap s1g_capa; /* configured S1G overrides */ struct ieee80211_s1g_cap s1g_capa_mask; /* valid s1g_capa bits */ /* TDLS support */ u8 tdls_peer[ETH_ALEN] __aligned(2); struct delayed_work tdls_peer_del_work; struct sk_buff *orig_teardown_skb; /* The original teardown skb */ struct sk_buff *teardown_skb; /* A copy to send through the AP */ spinlock_t teardown_lock; /* To lock changing teardown_skb */ bool tdls_chan_switch_prohibited; bool tdls_wider_bw_prohibited; /* WMM-AC TSPEC support */ struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS]; /* Use a separate work struct so that we can do something here * while the sdata->work is flushing the queues, for example. * otherwise, in scenarios where we hardly get any traffic out * on the BE queue, but there's a lot of VO traffic, we might * get stuck in a downgraded situation and flush takes forever. */ struct delayed_work tx_tspec_wk; /* Information elements from the last transmitted (Re)Association * Request frame. */ u8 *assoc_req_ies; size_t assoc_req_ies_len; }; struct ieee80211_if_ibss { struct timer_list timer; struct work_struct csa_connection_drop_work; unsigned long last_scan_completed; u32 basic_rates; bool fixed_bssid; bool fixed_channel; bool privacy; bool control_port; bool userspace_handles_dfs; u8 bssid[ETH_ALEN] __aligned(2); u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len, ie_len; u8 *ie; struct cfg80211_chan_def chandef; unsigned long ibss_join_req; /* probe response/beacon for IBSS */ struct beacon_data __rcu *presp; struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ spinlock_t incomplete_lock; struct list_head incomplete_stations; enum { IEEE80211_IBSS_MLME_SEARCH, IEEE80211_IBSS_MLME_JOINED, } state; }; /** * struct ieee80211_if_ocb - OCB mode state * * @housekeeping_timer: timer for periodic invocation of a housekeeping task * @wrkq_flags: OCB deferred task action * @incomplete_lock: delayed STA insertion lock * @incomplete_stations: list of STAs waiting for delayed insertion * @joined: indication if the interface is connected to an OCB network */ struct ieee80211_if_ocb { struct timer_list housekeeping_timer; unsigned long wrkq_flags; spinlock_t incomplete_lock; struct list_head incomplete_stations; bool joined; }; /** * struct ieee80211_mesh_sync_ops - Extensible synchronization framework interface * * these declarations define the interface, which enables * vendor-specific mesh synchronization * */ struct ieee802_11_elems; struct ieee80211_mesh_sync_ops { void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata, u16 stype, struct ieee80211_mgmt *mgmt, unsigned int len, const struct ieee80211_meshconf_ie *mesh_cfg, struct ieee80211_rx_status *rx_status); /* should be called with beacon_data under RCU read lock */ void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon); /* add other framework functions here */ }; struct mesh_csa_settings { struct rcu_head rcu_head; struct cfg80211_csa_settings settings; }; /** * struct mesh_table * * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. * @gates_lock: protects updates to known_gates * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr * @walk_head: linked list containing all mesh_path objects * @walk_lock: lock protecting walk_head * @entries: number of entries in the table */ struct mesh_table { struct hlist_head known_gates; spinlock_t gates_lock; struct rhashtable rhead; struct hlist_head walk_head; spinlock_t walk_lock; atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ }; struct ieee80211_if_mesh { struct timer_list housekeeping_timer; struct timer_list mesh_path_timer; struct timer_list mesh_path_root_timer; unsigned long wrkq_flags; unsigned long mbss_changed; bool userspace_handles_dfs; u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN]; size_t mesh_id_len; /* Active Path Selection Protocol Identifier */ u8 mesh_pp_id; /* Active Path Selection Metric Identifier */ u8 mesh_pm_id; /* Congestion Control Mode Identifier */ u8 mesh_cc_id; /* Synchronization Protocol Identifier */ u8 mesh_sp_id; /* Authentication Protocol Identifier */ u8 mesh_auth_id; /* Local mesh Sequence Number */ u32 sn; /* Last used PREQ ID */ u32 preq_id; atomic_t mpaths; /* Timestamp of last SN update */ unsigned long last_sn_update; /* Time when it's ok to send next PERR */ unsigned long next_perr; /* Timestamp of last PREQ sent */ unsigned long last_preq; struct mesh_rmc *rmc; spinlock_t mesh_preq_queue_lock; struct mesh_preq_queue preq_queue; int preq_queue_len; struct mesh_stats mshstats; struct mesh_config mshcfg; atomic_t estab_plinks; u32 mesh_seqnum; bool accepting_plinks; int num_gates; struct beacon_data __rcu *beacon; const u8 *ie; u8 ie_len; enum { IEEE80211_MESH_SEC_NONE = 0x0, IEEE80211_MESH_SEC_AUTHED = 0x1, IEEE80211_MESH_SEC_SECURED = 0x2, } security; bool user_mpm; /* Extensible Synchronization Framework */ const struct ieee80211_mesh_sync_ops *sync_ops; s64 sync_offset_clockdrift_max; spinlock_t sync_offset_lock; /* mesh power save */ enum nl80211_mesh_power_mode nonpeer_pm; int ps_peers_light_sleep; int ps_peers_deep_sleep; struct ps_data ps; /* Channel Switching Support */ struct mesh_csa_settings __rcu *csa; enum { IEEE80211_MESH_CSA_ROLE_NONE, IEEE80211_MESH_CSA_ROLE_INIT, IEEE80211_MESH_CSA_ROLE_REPEATER, } csa_role; u8 chsw_ttl; u16 pre_value; /* offset from skb->data while building IE */ int meshconf_offset; struct mesh_table mesh_paths; struct mesh_table mpp_paths; /* Store paths for MPP&MAP */ int mesh_paths_generation; int mpp_paths_generation; }; #ifdef CONFIG_MAC80211_MESH #define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ do { (msh)->mshstats.name++; } while (0) #else #define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ do { } while (0) #endif /** * enum ieee80211_sub_if_data_flags - virtual interface flags * * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between * associated stations and deliver multicast frames both * back to wireless media and to the local net stack. * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume. * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver * @IEEE80211_SDATA_DISCONNECT_HW_RESTART: Disconnect after hardware restart * recovery */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), IEEE80211_SDATA_OPERATING_GMODE = BIT(2), IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), IEEE80211_SDATA_IN_DRIVER = BIT(5), IEEE80211_SDATA_DISCONNECT_HW_RESTART = BIT(6), }; /** * enum ieee80211_sdata_state_bits - virtual interface state bits * @SDATA_STATE_RUNNING: virtual interface is up & running; this * mirrors netif_running() but is separate for interface type * change handling while the interface is up * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel * mode, so queues are stopped * @SDATA_STATE_OFFCHANNEL_BEACON_STOPPED: Beaconing was stopped due * to offchannel, reset when offchannel returns */ enum ieee80211_sdata_state_bits { SDATA_STATE_RUNNING, SDATA_STATE_OFFCHANNEL, SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, }; /** * enum ieee80211_chanctx_mode - channel context configuration mode * * @IEEE80211_CHANCTX_SHARED: channel context may be used by * multiple interfaces * @IEEE80211_CHANCTX_EXCLUSIVE: channel context can be used * only by a single interface. This can be used for example for * non-fixed channel IBSS. */ enum ieee80211_chanctx_mode { IEEE80211_CHANCTX_SHARED, IEEE80211_CHANCTX_EXCLUSIVE }; /** * enum ieee80211_chanctx_replace_state - channel context replacement state * * This is used for channel context in-place reservations that require channel * context switch/swap. * * @IEEE80211_CHANCTX_REPLACE_NONE: no replacement is taking place * @IEEE80211_CHANCTX_WILL_BE_REPLACED: this channel context will be replaced * by a (not yet registered) channel context pointed by %replace_ctx. * @IEEE80211_CHANCTX_REPLACES_OTHER: this (not yet registered) channel context * replaces an existing channel context pointed to by %replace_ctx. */ enum ieee80211_chanctx_replace_state { IEEE80211_CHANCTX_REPLACE_NONE, IEEE80211_CHANCTX_WILL_BE_REPLACED, IEEE80211_CHANCTX_REPLACES_OTHER, }; struct ieee80211_chanctx { struct list_head list; struct rcu_head rcu_head; struct list_head assigned_vifs; struct list_head reserved_vifs; enum ieee80211_chanctx_replace_state replace_state; struct ieee80211_chanctx *replace_ctx; enum ieee80211_chanctx_mode mode; bool driver_present; struct ieee80211_chanctx_conf conf; }; struct mac80211_qos_map { struct cfg80211_qos_map qos_map; struct rcu_head rcu_head; }; enum txq_info_flags { IEEE80211_TXQ_STOP, IEEE80211_TXQ_AMPDU, IEEE80211_TXQ_NO_AMSDU, IEEE80211_TXQ_STOP_NETIF_TX, }; /** * struct txq_info - per tid queue * * @tin: contains packets split into multiple flows * @def_flow: used as a fallback flow when a packet destined to @tin hashes to * a fq_flow which is already owned by a different tin * @def_cvars: codel vars for @def_flow * @schedule_order: used with ieee80211_local->active_txqs * @frags: used to keep fragments created after dequeue */ struct txq_info { struct fq_tin tin; struct codel_vars def_cvars; struct codel_stats cstats; struct rb_node schedule_order; struct sk_buff_head frags; unsigned long flags; /* keep last! */ struct ieee80211_txq txq; }; struct ieee80211_if_mntr { u32 flags; u8 mu_follow_addr[ETH_ALEN] __aligned(2); struct list_head list; }; /** * struct ieee80211_if_nan - NAN state * * @conf: current NAN configuration * @func_ids: a bitmap of available instance_id's */ struct ieee80211_if_nan { struct cfg80211_nan_conf conf; /* protects function_inst_ids */ spinlock_t func_lock; struct idr function_inst_ids; }; struct ieee80211_sub_if_data { struct list_head list; struct wireless_dev wdev; /* keys */ struct list_head key_list; /* count for keys needing tailroom space allocation */ int crypto_tx_tailroom_needed_cnt; int crypto_tx_tailroom_pending_dec; struct delayed_work dec_tailroom_needed_wk; struct net_device *dev; struct ieee80211_local *local; unsigned int flags; unsigned long state; char name[IFNAMSIZ]; struct ieee80211_fragment_cache frags; /* TID bitmap for NoAck policy */ u16 noack_map; /* bit field of ACM bits (BIT(802.1D tag)) */ u8 wmm_acm; struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS]; struct ieee80211_key __rcu *default_unicast_key; struct ieee80211_key __rcu *default_multicast_key; struct ieee80211_key __rcu *default_mgmt_key; struct ieee80211_key __rcu *default_beacon_key; u16 sequence_number; __be16 control_port_protocol; bool control_port_no_encrypt; bool control_port_no_preauth; bool control_port_over_nl80211; int encrypt_headroom; atomic_t num_tx_queued; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct mac80211_qos_map __rcu *qos_map; struct airtime_info airtime[IEEE80211_NUM_ACS]; struct work_struct csa_finalize_work; bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */ struct cfg80211_chan_def csa_chandef; struct work_struct color_change_finalize_work; struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */ struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */ /* context reservation -- protected with chanctx_mtx */ struct ieee80211_chanctx *reserved_chanctx; struct cfg80211_chan_def reserved_chandef; bool reserved_radar_required; bool reserved_ready; /* used to reconfigure hardware SM PS */ struct work_struct recalc_smps; struct work_struct work; struct sk_buff_head skb_queue; struct sk_buff_head status_queue; u8 needed_rx_chains; enum ieee80211_smps_mode smps_mode; int user_power_level; /* in dBm */ int ap_power_level; /* in dBm */ bool radar_required; struct delayed_work dfs_cac_timer_work; /* * AP this belongs to: self in AP mode and * corresponding AP in VLAN mode, NULL for * all others (might be needed later in IBSS) */ struct ieee80211_if_ap *bss; /* bitmap of allowed (non-MCS) rate indexes for rate control */ u32 rc_rateidx_mask[NUM_NL80211_BANDS]; bool rc_has_mcs_mask[NUM_NL80211_BANDS]; u8 rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN]; bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; /* Beacon frame (non-MCS) rate (as a bitmap) */ u32 beacon_rateidx_mask[NUM_NL80211_BANDS]; bool beacon_rate_set; union { struct ieee80211_if_ap ap; struct ieee80211_if_vlan vlan; struct ieee80211_if_managed mgd; struct ieee80211_if_ibss ibss; struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; struct ieee80211_if_mntr mntr; struct ieee80211_if_nan nan; } u; #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *subdir_stations; struct dentry *default_unicast_key; struct dentry *default_multicast_key; struct dentry *default_mgmt_key; struct dentry *default_beacon_key; } debugfs; #endif /* must be last, dynamically sized area in this! */ struct ieee80211_vif vif; }; static inline struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) { return container_of(p, struct ieee80211_sub_if_data, vif); } static inline void sdata_lock(struct ieee80211_sub_if_data *sdata) __acquires(&sdata->wdev.mtx) { mutex_lock(&sdata->wdev.mtx); __acquire(&sdata->wdev.mtx); } static inline void sdata_unlock(struct ieee80211_sub_if_data *sdata) __releases(&sdata->wdev.mtx) { mutex_unlock(&sdata->wdev.mtx); __release(&sdata->wdev.mtx); } #define sdata_dereference(p, sdata) \ rcu_dereference_protected(p, lockdep_is_held(&sdata->wdev.mtx)) static inline void sdata_assert_lock(struct ieee80211_sub_if_data *sdata) { lockdep_assert_held(&sdata->wdev.mtx); } static inline int ieee80211_chandef_get_shift(struct cfg80211_chan_def *chandef) { switch (chandef->width) { case NL80211_CHAN_WIDTH_5: return 2; case NL80211_CHAN_WIDTH_10: return 1; default: return 0; } } static inline int ieee80211_vif_get_shift(struct ieee80211_vif *vif) { struct ieee80211_chanctx_conf *chanctx_conf; int shift = 0; rcu_read_lock(); chanctx_conf = rcu_dereference(vif->chanctx_conf); if (chanctx_conf) shift = ieee80211_chandef_get_shift(&chanctx_conf->def); rcu_read_unlock(); return shift; } enum { IEEE80211_RX_MSG = 1, IEEE80211_TX_STATUS_MSG = 2, }; enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_DRIVER, IEEE80211_QUEUE_STOP_REASON_PS, IEEE80211_QUEUE_STOP_REASON_CSA, IEEE80211_QUEUE_STOP_REASON_AGGREGATION, IEEE80211_QUEUE_STOP_REASON_SUSPEND, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, IEEE80211_QUEUE_STOP_REASON_FLUSH, IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID, IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE, IEEE80211_QUEUE_STOP_REASONS, }; #ifdef CONFIG_MAC80211_LEDS struct tpt_led_trigger { char name[32]; const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; struct timer_list timer; struct ieee80211_local *local; unsigned long prev_traffic; unsigned long tx_bytes, rx_bytes; unsigned int active, want; bool running; }; #endif /** * mac80211 scan flags - currently active scan mode * * @SCAN_SW_SCANNING: We're currently in the process of scanning but may as * well be on the operating channel * @SCAN_HW_SCANNING: The hardware is scanning for us, we have no way to * determine if we are on the operating channel or not * @SCAN_ONCHANNEL_SCANNING: Do a software scan on only the current operating * channel. This should not interrupt normal traffic. * @SCAN_COMPLETED: Set for our scan work function when the driver reported * that the scan completed. * @SCAN_ABORTED: Set for our scan work function when the driver reported * a scan complete for an aborted scan. * @SCAN_HW_CANCELLED: Set for our scan work function when the scan is being * cancelled. * @SCAN_BEACON_WAIT: Set whenever we're passive scanning because of radar/no-IR * and could send a probe request after receiving a beacon. * @SCAN_BEACON_DONE: Beacon received, we can now send a probe request */ enum { SCAN_SW_SCANNING, SCAN_HW_SCANNING, SCAN_ONCHANNEL_SCANNING, SCAN_COMPLETED, SCAN_ABORTED, SCAN_HW_CANCELLED, SCAN_BEACON_WAIT, SCAN_BEACON_DONE, }; /** * enum mac80211_scan_state - scan state machine states * * @SCAN_DECISION: Main entry point to the scan state machine, this state * determines if we should keep on scanning or switch back to the * operating channel * @SCAN_SET_CHANNEL: Set the next channel to be scanned * @SCAN_SEND_PROBE: Send probe requests and wait for probe responses * @SCAN_SUSPEND: Suspend the scan and go back to operating channel to * send out data * @SCAN_RESUME: Resume the scan and scan the next channel * @SCAN_ABORT: Abort the scan and go back to operating channel */ enum mac80211_scan_state { SCAN_DECISION, SCAN_SET_CHANNEL, SCAN_SEND_PROBE, SCAN_SUSPEND, SCAN_RESUME, SCAN_ABORT, }; /** * struct airtime_sched_info - state used for airtime scheduling and AQL * * @lock: spinlock that protects all the fields in this struct * @active_txqs: rbtree of currently backlogged queues, sorted by virtual time * @schedule_pos: the current position maintained while a driver walks the tree * with ieee80211_next_txq() * @active_list: list of struct airtime_info structs that were active within * the last AIRTIME_ACTIVE_DURATION (100 ms), used to compute * weight_sum * @last_weight_update: used for rate limiting walking active_list * @last_schedule_time: tracks the last time a transmission was scheduled; used * for catching up v_t if no stations are eligible for * transmission. * @v_t: global virtual time; queues with v_t < this are eligible for * transmission * @weight_sum: total sum of all active stations used for dividing airtime * @weight_sum_reciprocal: reciprocal of weight_sum (to avoid divisions in fast * path - see comment above * IEEE80211_RECIPROCAL_DIVISOR_64) * @aql_txq_limit_low: AQL limit when total outstanding airtime * is < IEEE80211_AQL_THRESHOLD * @aql_txq_limit_high: AQL limit when total outstanding airtime * is > IEEE80211_AQL_THRESHOLD */ struct airtime_sched_info { spinlock_t lock; struct rb_root_cached active_txqs; struct rb_node *schedule_pos; struct list_head active_list; u64 last_weight_update; u64 last_schedule_activity; u64 v_t; u64 weight_sum; u64 weight_sum_reciprocal; u32 aql_txq_limit_low; u32 aql_txq_limit_high; }; DECLARE_STATIC_KEY_FALSE(aql_disable); struct ieee80211_local { /* embed the driver visible part. * don't cast (use the static inlines below), but we keep * it first anyway so they become a no-op */ struct ieee80211_hw hw; struct fq fq; struct codel_vars *cvars; struct codel_params cparams; /* protects active_txqs and txqi->schedule_order */ struct airtime_sched_info airtime[IEEE80211_NUM_ACS]; u16 airtime_flags; u32 aql_threshold; atomic_t aql_total_pending_airtime; const struct ieee80211_ops *ops; /* * private workqueue to mac80211. mac80211 makes this accessible * via ieee80211_queue_work() */ struct workqueue_struct *workqueue; unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; int q_stop_reasons[IEEE80211_MAX_QUEUES][IEEE80211_QUEUE_STOP_REASONS]; /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ spinlock_t queue_stop_reason_lock; int open_count; int monitors, cooked_mntrs; /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; bool probe_req_reg; bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; bool use_chanctx; /* protects the aggregated multicast list and filter calls */ spinlock_t filter_lock; /* used for uploading changed mc list */ struct work_struct reconfig_filter; /* aggregated multicast list */ struct netdev_hw_addr_list mc_list; bool tim_in_locked_section; /* see ieee80211_beacon_get() */ /* * suspended is true if we finished all the suspend _and_ we have * not yet come up from resume. This is to be used by mac80211 * to ensure driver sanity during suspend and mac80211's own * sanity. It can eventually be used for WoW as well. */ bool suspended; /* * Resuming is true while suspended, but when we're reprogramming the * hardware -- at that time it's allowed to use ieee80211_queue_work() * again even though some other parts of the stack are still suspended * and we still drop received frames to avoid waking the stack. */ bool resuming; /* * quiescing is true during the suspend process _only_ to * ease timer cancelling etc. */ bool quiescing; /* device is started */ bool started; /* device is during a HW reconfig */ bool in_reconfig; /* wowlan is enabled -- don't reconfig on resume */ bool wowlan; struct work_struct radar_detected_work; /* number of RX chains the hardware has */ u8 rx_chains; /* bitmap of which sbands were copied */ u8 sband_allocated; int tx_headroom; /* required headroom for hardware/radiotap */ /* Tasklet and skb queue to process calls from IRQ mode. All frames * added to skb_queue will be processed, but frames in * skb_queue_unreliable may be dropped if the total length of these * queues increases over the limit. */ #define IEEE80211_IRQSAFE_QUEUE_LIMIT 128 struct tasklet_struct tasklet; struct sk_buff_head skb_queue; struct sk_buff_head skb_queue_unreliable; spinlock_t rx_path_lock; /* Station data */ /* * The mutex only protects the list, hash table and * counter, reads are done with RCU. */ struct mutex sta_mtx; spinlock_t tim_lock; unsigned long num_sta; struct list_head sta_list; struct rhltable sta_hash; struct timer_list sta_cleanup; int sta_generation; struct sk_buff_head pending[IEEE80211_MAX_QUEUES]; struct tasklet_struct tx_pending_tasklet; struct tasklet_struct wake_txqs_tasklet; atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; /* number of interfaces with allmulti RX */ atomic_t iff_allmultis; struct rate_control_ref *rate_ctrl; struct arc4_ctx wep_tx_ctx; struct arc4_ctx wep_rx_ctx; u32 wep_iv; /* see iface.c */ struct list_head interfaces; struct list_head mon_list; /* only that are IFF_UP && !cooked */ struct mutex iflist_mtx; /* * Key mutex, protects sdata's key_list and sta_info's * key pointers and ptk_idx (write access, they're RCU.) */ struct mutex key_mtx; /* mutex for scan and work locking */ struct mutex mtx; /* Scanning and BSS list */ unsigned long scanning; struct cfg80211_ssid scan_ssid; struct cfg80211_scan_request *int_scan_req; struct cfg80211_scan_request __rcu *scan_req; struct ieee80211_scan_request *hw_scan_req; struct cfg80211_chan_def scan_chandef; enum nl80211_band hw_scan_band; int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; struct cfg80211_scan_info scan_info; struct work_struct sched_scan_stopped_work; struct ieee80211_sub_if_data __rcu *sched_scan_sdata; struct cfg80211_sched_scan_request __rcu *sched_scan_req; u8 scan_addr[ETH_ALEN]; unsigned long leave_oper_channel_time; enum mac80211_scan_state next_scan_state; struct delayed_work scan_work; struct ieee80211_sub_if_data __rcu *scan_sdata; /* For backward compatibility only -- do not use */ struct cfg80211_chan_def _oper_chandef; /* Temporary remain-on-channel for off-channel operations */ struct ieee80211_channel *tmp_channel; /* channel contexts */ struct list_head chanctx_list; struct mutex chanctx_mtx; #ifdef CONFIG_MAC80211_LEDS struct led_trigger tx_led, rx_led, assoc_led, radio_led; struct led_trigger tpt_led; atomic_t tx_led_active, rx_led_active, assoc_led_active; atomic_t radio_led_active, tpt_led_active; struct tpt_led_trigger *tpt_led_trigger; #endif #ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* SNMP counters */ /* dot11CountersTable */ u32 dot11TransmittedFragmentCount; u32 dot11MulticastTransmittedFrameCount; u32 dot11FailedCount; u32 dot11RetryCount; u32 dot11MultipleRetryCount; u32 dot11FrameDuplicateCount; u32 dot11ReceivedFragmentCount; u32 dot11MulticastReceivedFrameCount; u32 dot11TransmittedFrameCount; /* TX/RX handler statistics */ unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; unsigned int tx_handlers_drop_unauth_port; unsigned int rx_handlers_drop; unsigned int rx_handlers_queued; unsigned int rx_handlers_drop_nullfunc; unsigned int rx_handlers_drop_defrag; unsigned int tx_expand_skb_head; unsigned int tx_expand_skb_head_cloned; unsigned int rx_expand_skb_head_defrag; unsigned int rx_handlers_fragments; unsigned int tx_status_drop; #define I802_DEBUG_INC(c) (c)++ #else /* CONFIG_MAC80211_DEBUG_COUNTERS */ #define I802_DEBUG_INC(c) do { } while (0) #endif /* CONFIG_MAC80211_DEBUG_COUNTERS */ int total_ps_buffered; /* total number of all buffered unicast and * multicast packets for power saving stations */ bool pspolling; /* * PS can only be enabled when we have exactly one managed * interface (and monitors) in PS, this then points there. */ struct ieee80211_sub_if_data *ps_sdata; struct work_struct dynamic_ps_enable_work; struct work_struct dynamic_ps_disable_work; struct timer_list dynamic_ps_timer; struct notifier_block ifa_notifier; struct notifier_block ifa6_notifier; /* * The dynamic ps timeout configured from user space via WEXT - * this will override whatever chosen by mac80211 internally. */ int dynamic_ps_forced_timeout; int user_power_level; /* in dBm, for all interfaces */ enum ieee80211_smps_mode smps_mode; struct work_struct restart_work; #ifdef CONFIG_MAC80211_DEBUGFS struct local_debugfsdentries { struct dentry *rcdir; struct dentry *keys; } debugfs; bool force_tx_status; #endif /* * Remain-on-channel support */ struct delayed_work roc_work; struct list_head roc_list; struct work_struct hw_roc_start, hw_roc_done; unsigned long hw_roc_start_time; u64 roc_cookie_counter; struct idr ack_status_frames; spinlock_t ack_status_lock; struct ieee80211_sub_if_data __rcu *p2p_sdata; /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct cfg80211_chan_def monitor_chandef; /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; }; static inline struct ieee80211_sub_if_data * IEEE80211_DEV_TO_SUB_IF(struct net_device *dev) { return netdev_priv(dev); } static inline struct ieee80211_sub_if_data * IEEE80211_WDEV_TO_SUB_IF(struct wireless_dev *wdev) { return container_of(wdev, struct ieee80211_sub_if_data, wdev); } static inline struct ieee80211_supported_band * ieee80211_get_sband(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return NULL; } band = chanctx_conf->def.chan->band; rcu_read_unlock(); return local->hw.wiphy->bands[band]; } /* this struct holds the value parsing from channel switch IE */ struct ieee80211_csa_ie { struct cfg80211_chan_def chandef; u8 mode; u8 count; u8 ttl; u16 pre_value; u16 reason_code; u32 max_switch_time; }; /* Parsed Information Elements */ struct ieee802_11_elems { const u8 *ie_start; size_t total_len; u32 crc; /* pointers to IEs */ const struct ieee80211_tdls_lnkie *lnk_id; const struct ieee80211_ch_switch_timing *ch_sw_timing; const u8 *ext_capab; const u8 *ssid; const u8 *supp_rates; const u8 *ds_params; const struct ieee80211_tim_ie *tim; const u8 *rsn; const u8 *rsnx; const u8 *erp_info; const u8 *ext_supp_rates; const u8 *wmm_info; const u8 *wmm_param; const struct ieee80211_ht_cap *ht_cap_elem; const struct ieee80211_ht_operation *ht_operation; const struct ieee80211_vht_cap *vht_cap_elem; const struct ieee80211_vht_operation *vht_operation; const struct ieee80211_meshconf_ie *mesh_config; const u8 *he_cap; const struct ieee80211_he_operation *he_operation; const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const struct ieee80211_tx_pwr_env *tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT]; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; const __le16 *awake_window; const u8 *preq; const u8 *prep; const u8 *perr; const struct ieee80211_rann_ie *rann; const struct ieee80211_channel_sw_ie *ch_switch_ie; const struct ieee80211_ext_chansw_ie *ext_chansw_ie; const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const u8 *max_channel_switch_time; const u8 *country_elem; const u8 *pwr_constr_elem; const u8 *cisco_dtpc_elem; const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie; const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie; const struct ieee80211_multiple_bssid_configuration *mbssid_config_ie; const struct ieee80211_bssid_index *bssid_index; u8 max_bssid_indicator; u8 dtim_count; u8 dtim_period; const struct ieee80211_addba_ext_ie *addba_ext_ie; const struct ieee80211_s1g_cap *s1g_capab; const struct ieee80211_s1g_oper_ie *s1g_oper; const struct ieee80211_s1g_bcn_compat_ie *s1g_bcn_compat; const struct ieee80211_aid_response_ie *aid_resp; /* length of them, respectively */ u8 ext_capab_len; u8 ssid_len; u8 supp_rates_len; u8 tim_len; u8 rsn_len; u8 rsnx_len; u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; u8 he_cap_len; u8 mesh_id_len; u8 peering_len; u8 preq_len; u8 prep_len; u8 perr_len; u8 country_elem_len; u8 bssid_index_len; u8 tx_pwr_env_len[IEEE80211_TPE_MAX_IE_COUNT]; u8 tx_pwr_env_num; /* whether a parse error occurred while retrieving these elements */ bool parse_error; /* * scratch buffer that can be used for various element parsing related * tasks, e.g., element de-fragmentation etc. */ size_t scratch_len; u8 *scratch_pos; u8 scratch[]; }; static inline struct ieee80211_local *hw_to_local( struct ieee80211_hw *hw) { return container_of(hw, struct ieee80211_local, hw); } static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq) { return container_of(txq, struct txq_info, txq); } static inline bool txq_has_queue(struct ieee80211_txq *txq) { struct txq_info *txqi = to_txq_info(txq); return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets); } static inline struct airtime_info *to_airtime_info(struct ieee80211_txq *txq) { struct ieee80211_sub_if_data *sdata; struct sta_info *sta; if (txq->sta) { sta = container_of(txq->sta, struct sta_info, sta); return &sta->airtime[txq->ac]; } sdata = vif_to_sdata(txq->vif); return &sdata->airtime[txq->ac]; } /* To avoid divisions in the fast path, we keep pre-computed reciprocals for * airtime weight calculations. There are two different weights to keep track * of: The per-station weight and the sum of weights per phy. * * For the per-station weights (kept in airtime_info below), we use 32-bit * reciprocals with a devisor of 2^19. This lets us keep the multiplications and * divisions for the station weights as 32-bit operations at the cost of a bit * of rounding error for high weights; but the choice of divisor keeps rounding * errors <10% for weights <2^15, assuming no more than 8ms of airtime is * reported at a time. * * For the per-phy sum of weights the values can get higher, so we use 64-bit * operations for those with a 32-bit divisor, which should avoid any * significant rounding errors. */ #define IEEE80211_RECIPROCAL_DIVISOR_64 0x100000000ULL #define IEEE80211_RECIPROCAL_SHIFT_64 32 #define IEEE80211_RECIPROCAL_DIVISOR_32 0x80000U #define IEEE80211_RECIPROCAL_SHIFT_32 19 static inline void airtime_weight_set(struct airtime_info *air_info, u16 weight) { if (air_info->weight == weight) return; air_info->weight = weight; if (weight) { air_info->weight_reciprocal = IEEE80211_RECIPROCAL_DIVISOR_32 / weight; } else { air_info->weight_reciprocal = 0; } } static inline void airtime_weight_sum_set(struct airtime_sched_info *air_sched, int weight_sum) { if (air_sched->weight_sum == weight_sum) return; air_sched->weight_sum = weight_sum; if (air_sched->weight_sum) { air_sched->weight_sum_reciprocal = IEEE80211_RECIPROCAL_DIVISOR_64; do_div(air_sched->weight_sum_reciprocal, air_sched->weight_sum); } else { air_sched->weight_sum_reciprocal = 0; } } /* A problem when trying to enforce airtime fairness is that we want to divide * the airtime between the currently *active* stations. However, basing this on * the instantaneous queue state of stations doesn't work, as queues tend to * oscillate very quickly between empty and occupied, leading to the scheduler * thinking only a single station is active when deciding whether to allow * transmission (and thus not throttling correctly). * * To fix this we use a timer-based notion of activity: a station is considered * active if it has been scheduled within the last 100 ms; we keep a separate * list of all the stations considered active in this manner, and lazily update * the total weight of active stations from this list (filtering the stations in * the list by their 'last active' time). * * We add one additional safeguard to guard against stations that manage to get * scheduled every 100 ms but don't transmit a lot of data, and thus don't use * up any airtime. Such stations would be able to get priority for an extended * period of time if they do start transmitting at full capacity again, and so * we add an explicit maximum for how far behind a station is allowed to fall in * the virtual airtime domain. This limit is set to a relatively high value of * 20 ms because the main mechanism for catching up idle stations is the active * state as described above; i.e., the hard limit should only be hit in * pathological cases. */ #define AIRTIME_ACTIVE_DURATION (100 * NSEC_PER_MSEC) #define AIRTIME_MAX_BEHIND 20000 /* 20 ms */ static inline bool airtime_is_active(struct airtime_info *air_info, u64 now) { return air_info->last_scheduled >= now - AIRTIME_ACTIVE_DURATION; } static inline void airtime_set_active(struct airtime_sched_info *air_sched, struct airtime_info *air_info, u64 now) { air_info->last_scheduled = now; air_sched->last_schedule_activity = now; list_move_tail(&air_info->list, &air_sched->active_list); } static inline bool airtime_catchup_v_t(struct airtime_sched_info *air_sched, u64 v_t, u64 now) { air_sched->v_t = v_t; return true; } static inline void init_airtime_info(struct airtime_info *air_info, struct airtime_sched_info *air_sched) { atomic_set(&air_info->aql_tx_pending, 0); air_info->aql_limit_low = air_sched->aql_txq_limit_low; air_info->aql_limit_high = air_sched->aql_txq_limit_high; airtime_weight_set(air_info, IEEE80211_DEFAULT_AIRTIME_WEIGHT); INIT_LIST_HEAD(&air_info->list); } static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) { return ether_addr_equal(raddr, addr) || is_broadcast_ether_addr(raddr); } static inline bool ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) { WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START && status->flag & RX_FLAG_MACTIME_END); return !!(status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END | RX_FLAG_MACTIME_PLCP_START)); } void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata); /* This function returns the number of multicast stations connected to this * interface. It returns -1 if that number is not tracked, that is for netdevs * not in AP or AP_VLAN mode or when using 4addr. */ static inline int ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_AP) return atomic_read(&sdata->u.ap.num_mcast_sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta) return atomic_read(&sdata->u.vlan.num_mcast_sta); return -1; } u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset); int ieee80211_hw_config(struct ieee80211_local *local, u32 changed); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u32 changed); void ieee80211_configure_filter(struct ieee80211_local *local); u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); void ieee80211_handle_queued_frames(struct ieee80211_local *local); u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp); void ieee80211_check_fast_rx(struct sta_info *sta); void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_rx(struct sta_info *sta); /* STA code */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct cfg80211_auth_request *req); int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req); int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct cfg80211_deauth_request *req); int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_disassoc_request *req); void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_ps(struct ieee80211_local *local); void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata); int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, __le16 fc, bool acked); void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, u8 *bssid, u8 reason, bool tx); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates); int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params); int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings); int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata); /* OCB code */ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates); void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, struct ocb_setup *setup); int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata); /* mesh code */ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings); int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata); /* scan/BSS handling */ void ieee80211_scan_work(struct work_struct *work); int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels, enum nl80211_bss_scan_width scan_width); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req); void ieee80211_scan_cancel(struct ieee80211_local *local); void ieee80211_run_deferred_scan(struct ieee80211_local *local); void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel); void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss); /* scheduled scan handling */ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req); int ieee80211_request_sched_scan_stop(struct ieee80211_local *local); void ieee80211_sched_scan_end(struct ieee80211_local *local); void ieee80211_sched_scan_stopped_work(struct work_struct *work); /* off-channel/mgmt-tx */ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local); void ieee80211_offchannel_return(struct ieee80211_local *local); void ieee80211_roc_setup(struct ieee80211_local *local); void ieee80211_start_next_roc(struct ieee80211_local *local); void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie); int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie); int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie); int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie); /* channel switch handling */ void ieee80211_csa_finalize_work(struct work_struct *work); int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params); /* color change handling */ void ieee80211_color_change_finalize_work(struct work_struct *work); /* interface handling */ #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE) #define MAC80211_SUPPORTED_FEATURES_RX (NETIF_F_RXCSUM) #define MAC80211_SUPPORTED_FEATURES (MAC80211_SUPPORTED_FEATURES_TX | \ MAC80211_SUPPORTED_FEATURES_RX) int ieee80211_iface_init(void); void ieee80211_iface_exit(void); int ieee80211_if_add(struct ieee80211_local *local, const char *name, unsigned char name_assign_type, struct wireless_dev **new_wdev, enum nl80211_iftype type, struct vif_params *params); int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type); void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata); void ieee80211_remove_interfaces(struct ieee80211_local *local); u32 ieee80211_idle_off(struct ieee80211_local *local); void ieee80211_recalc_idle(struct ieee80211_local *local); void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset); int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up); void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata); int ieee80211_add_virtual_monitor(struct ieee80211_local *local); void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata, bool update_bss); void ieee80211_recalc_offload(struct ieee80211_local *local); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { return test_bit(SDATA_STATE_RUNNING, &sdata->state); } /* tx handling */ void ieee80211_clear_tx_pending(struct ieee80211_local *local); void ieee80211_tx_pending(struct tasklet_struct *t); netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct net_device *dev); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, u32 ctrl_flags, u64 *cookie); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs); struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_supported_band *sband, int retry_count, int shift, bool send_to_cooked, struct ieee80211_tx_status *status); void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); void ieee80211_resort_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq); void ieee80211_unschedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, bool purge); void ieee80211_update_airtime_weight(struct ieee80211_local *local, struct airtime_sched_info *air_sched, u64 now, bool force); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap); bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_cap *ht_cap_ie, struct sta_info *sta); void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code); int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid); void ieee80211_request_smps_ap_work(struct work_struct *work); void ieee80211_request_smps_mgd_work(struct work_struct *work); bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old, enum ieee80211_smps_mode smps_mode_new); void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void ___ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, const struct ieee80211_addba_ext_ie *addbaext); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_addba_resp(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_agg_stop_reason reason); int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_agg_stop_reason reason); void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid, struct tid_ampdu_tx *tid_tx); void ieee80211_ba_session_work(struct work_struct *work); void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs); enum nl80211_smps_mode ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps); /* VHT */ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta); void ieee80211_sta_set_rx_nss(struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width); enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta); void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum nl80211_band band); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum nl80211_band band); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]); enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); /* HE */ void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta); void ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, const struct ieee80211_he_spr *he_spr_ie_elem); void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, const struct ieee80211_he_operation *he_op_ie_elem); /* S1G */ void ieee80211_s1g_sta_rate_init(struct sta_info *sta); bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb); void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); /** * ieee80211_parse_ch_switch_ie - parses channel switch IEs * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band * @vht_cap_info: VHT capabilities of the transmitter * @sta_flags: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted. Only the * following subset of &enum ieee80211_sta_flags are evaluated: * %IEEE80211_STA_DISABLE_HT, %IEEE80211_STA_DISABLE_VHT, * %IEEE80211_STA_DISABLE_40MHZ, %IEEE80211_STA_DISABLE_80P80MHZ, * %IEEE80211_STA_DISABLE_160MHZ. * @bssid: the currently connected bssid (for reporting) * @csa_ie: parsed 802.11 csa elements on count, mode, chandef and mesh ttl. All of them will be filled with if success only. * Return: 0 on success, <0 on error and >0 if there is nothing to parse. */ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); /* Suspend/resume and hw reconfiguration */ int ieee80211_reconfig(struct ieee80211_local *local); void ieee80211_stop_device(struct ieee80211_local *local); int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); static inline int __ieee80211_resume(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) && !test_bit(SCAN_COMPLETED, &local->scanning), "%s: resume with hardware scan still in progress\n", wiphy_name(hw->wiphy)); return ieee80211_reconfig(hw_to_local(hw)); } /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, enum nl80211_band band); /* sta_out needs to be checked for ERR_PTR() before using */ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info **sta_out); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, enum nl80211_band band) { rcu_read_lock(); __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); rcu_read_unlock(); } static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid) { struct ieee80211_chanctx_conf *chanctx_conf; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } __ieee80211_tx_skb_tid_band(sdata, skb, tid, chanctx_conf->def.chan->band); rcu_read_unlock(); } static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */ ieee80211_tx_skb_tid(sdata, skb, 7); } struct ieee802_11_elems *ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, u64 filter, u32 crc, const u8 *transmitter_bssid, const u8 *bss_bssid); static inline struct ieee802_11_elems * ieee802_11_parse_elems(const u8 *start, size_t len, bool action, const u8 *transmitter_bssid, const u8 *bss_bssid) { return ieee802_11_parse_elems_crc(start, len, action, 0, 0, transmitter_bssid, bss_bssid); } extern const int ieee802_1d_to_ac[8]; static inline int ieee80211_ac_from_tid(int tid) { return ieee802_1d_to_ac[tid & 7]; } void ieee80211_dynamic_ps_enable_work(struct work_struct *work); void ieee80211_dynamic_ps_disable_work(struct work_struct *work); void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted); void ieee80211_stop_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason); void ieee80211_wake_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason); void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted); void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs); void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop); void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop); static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) { /* * It's unsafe to try to do any work during reconfigure flow. * When the flow ends the work will be requeued. */ if (local->in_reconfig) return false; /* * If quiescing is set, we are racing with __ieee80211_suspend. * __ieee80211_suspend flushes the workers after setting quiescing, * and we check quiescing / suspended before enqueing new workers. * We should abort the worker to avoid the races below. */ if (local->quiescing) return false; /* * We might already be suspended if the following scenario occurs: * __ieee80211_suspend Control path * * if (local->quiescing) * return; * local->quiescing = true; * flush_workqueue(); * queue_work(...); * local->suspended = true; * local->quiescing = false; * worker starts running... */ if (local->suspended) return false; return true; } int ieee80211_txq_setup_flows(struct ieee80211_local *local); void ieee80211_txq_set_params(struct ieee80211_local *local); void ieee80211_txq_teardown_flows(struct ieee80211_local *local); void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txq, int tid); void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi); void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi); void ieee80211_wake_txqs(struct tasklet_struct *t); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, const u8 *da, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags); void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); enum { IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1), IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags); struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags); u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates); int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata); size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset); u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap); u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode); void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); u8 *ieee80211_ie_build_he_cap(u32 disable_flags, u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end); void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, enum nl80211_band band); int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, enum nl80211_band band); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb); void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode); int __must_check ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode, bool radar_required); int __must_check ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata); int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata); int __must_check ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 *changed); void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, bool clear); int ieee80211_chanctx_refcount(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); bool ieee80211_is_radar_required(struct ieee80211_local *local); void ieee80211_dfs_cac_timer(unsigned long data); void ieee80211_dfs_cac_timer_work(struct work_struct *work); void ieee80211_dfs_cac_cancel(struct ieee80211_local *local); void ieee80211_dfs_radar_detected_work(struct work_struct *work); int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings); bool ieee80211_cs_valid(const struct ieee80211_cipher_scheme *cs); bool ieee80211_cs_list_valid(const struct ieee80211_cipher_scheme *cs, int n); const struct ieee80211_cipher_scheme * ieee80211_cs_get(struct ieee80211_local *local, u32 cipher, enum nl80211_iftype iftype); int ieee80211_cs_headroom(struct ieee80211_local *local, struct cfg80211_crypto_settings *crypto, enum nl80211_iftype iftype); void ieee80211_recalc_dtim(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect); int ieee80211_max_num_channels(struct ieee80211_local *local); void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, struct ieee80211_chanctx *ctx); /* TDLS */ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *extra_ies, size_t extra_ies_len); int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper); void ieee80211_tdls_peer_del_work(struct work_struct *wk); int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef); void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata); void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason); void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); const char *ieee80211_get_reason_code_string(u16 reason_code); u16 ieee80211_encode_usf(int val); u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type); extern const struct ethtool_ops ieee80211_ethtool_ops; u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *pubsta, int len, bool ampdu); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else #define debug_noinline #endif void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache); void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache); #endif /* IEEE80211_I_H */
47 47 4 4 4 4 4 4 5 1 4 4 4 4 4 4 4 4 4 4 2 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * SMC statistics netlink routines * * Copyright IBM Corp. 2021 * * Author(s): Guvenc Gulce */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/ctype.h> #include <linux/smc.h> #include <net/genetlink.h> #include <net/sock.h> #include "smc_netlink.h" #include "smc_stats.h" int smc_stats_init(struct net *net) { net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); if (!net->smc.fback_rsn) goto err_fback; net->smc.smc_stats = alloc_percpu(struct smc_stats); if (!net->smc.smc_stats) goto err_stats; mutex_init(&net->smc.mutex_fback_rsn); return 0; err_stats: kfree(net->smc.fback_rsn); err_fback: return -ENOMEM; } void smc_stats_exit(struct net *net) { kfree(net->smc.fback_rsn); if (net->smc.smc_stats) free_percpu(net->smc.smc_stats); } static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_rmbcnt *stats_rmb_cnt; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TX_RMB_STATS) stats_rmb_cnt = &stats->smc[tech].rmb_tx; else stats_rmb_cnt = &stats->smc[tech].rmb_rx; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, stats_rmb_cnt->reuse_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, stats_rmb_cnt->buf_size_small_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, stats_rmb_cnt->buf_size_small_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, stats_rmb_cnt->buf_full_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, stats_rmb_cnt->buf_full_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, stats_rmb_cnt->alloc_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, stats_rmb_cnt->dgrade_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_memsize *stats_pload; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) stats_pload = &stats->smc[tech].tx_pd; else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) stats_pload = &stats->smc[tech].rx_pd; else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) stats_pload = &stats->smc[tech].tx_rmbsize; else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) stats_pload = &stats->smc[tech].rx_rmbsize; else goto errout; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, stats_pload->buf[SMC_BUF_8K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, stats_pload->buf[SMC_BUF_16K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, stats_pload->buf[SMC_BUF_32K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, stats_pload->buf[SMC_BUF_64K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, stats_pload->buf[SMC_BUF_128K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, stats_pload->buf[SMC_BUF_256K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, stats_pload->buf[SMC_BUF_512K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, stats_pload->buf[SMC_BUF_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, stats_pload->buf[SMC_BUF_G_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, struct smc_stats *stats, int tech) { struct smc_stats_tech *smc_tech; struct nlattr *attrs; smc_tech = &stats->smc[tech]; if (tech == SMC_TYPE_D) attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); else attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); if (!attrs) goto errout; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_SIZE)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, smc_tech->clnt_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, smc_tech->clnt_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, smc_tech->srv_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, smc_tech->srv_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, smc_tech->rx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, smc_tech->tx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, smc_tech->rx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, smc_tech->tx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, smc_tech->sendpage_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, smc_tech->cork_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, smc_tech->ndly_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, smc_tech->splice_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, smc_tech->urg_data_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); struct smc_stats *stats; struct nlattr *attrs; int cpu, i, size; void *nlh; u64 *src; u64 *sum; if (cb_ctx->pos[0]) goto errmsg; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_STATS); if (!attrs) goto errnest; stats = kzalloc(sizeof(*stats), GFP_KERNEL); if (!stats) goto erralloc; size = sizeof(*stats) / sizeof(u64); for_each_possible_cpu(cpu) { src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); sum = (u64 *)stats; for (i = 0; i < size; i++) *(sum++) += *(src++); } if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) goto errattr; if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, stats->clnt_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, stats->srv_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); cb_ctx->pos[0] = 1; kfree(stats); return skb->len; errattr: kfree(stats); erralloc: nla_nest_cancel(skb, attrs); errnest: genlmsg_cancel(skb, nlh); errmsg: return skb->len; } static int smc_nl_get_fback_details(struct sk_buff *skb, struct netlink_callback *cb, int pos, bool is_srv) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int cnt_reported = cb_ctx->pos[2]; struct smc_stats_fback *trgt_arr; struct nlattr *attrs; int rc = 0; void *nlh; if (is_srv) trgt_arr = &net->smc.fback_rsn->srv[0]; else trgt_arr = &net->smc.fback_rsn->clnt[0]; if (!trgt_arr[pos].fback_code) return -ENODATA; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_FBACK_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) goto errattr; if (!cnt_reported) { if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, net->smc.fback_rsn->srv_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, net->smc.fback_rsn->clnt_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; cnt_reported = 1; } if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, trgt_arr[pos].fback_code)) goto errattr; if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, trgt_arr[pos].count)) goto errattr; cb_ctx->pos[2] = cnt_reported; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return rc; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int rc_srv = 0, rc_clnt = 0, k; int skip_serv = cb_ctx->pos[1]; int snum = cb_ctx->pos[0]; bool is_srv = true; mutex_lock(&net->smc.mutex_fback_rsn); for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { if (k < snum) continue; if (!skip_serv) { rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); if (rc_srv && rc_srv != -ENODATA) break; } else { skip_serv = 0; } rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); if (rc_clnt && rc_clnt != -ENODATA) { skip_serv = 1; break; } if (rc_clnt == -ENODATA && rc_srv == -ENODATA) break; } mutex_unlock(&net->smc.mutex_fback_rsn); cb_ctx->pos[1] = skip_serv; cb_ctx->pos[0] = k; return skb->len; }
6 6 6 6 903 895 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> /* Core of can-j1939 that links j1939 to CAN. */ #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/if_arp.h> #include <linux/module.h> #include "j1939-priv.h" MODULE_DESCRIPTION("PF_CAN SAE J1939"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)"); MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); /* LOWLEVEL CAN interface */ /* CAN_HDR: #bytes before can_frame data part */ #define J1939_CAN_HDR (offsetof(struct can_frame, data)) /* lowest layer */ static void j1939_can_recv(struct sk_buff *iskb, void *data) { struct j1939_priv *priv = data; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb, *iskcb; struct can_frame *cf; /* create a copy of the skb * j1939 only delivers the real data bytes, * the header goes into sockaddr. * j1939 may not touch the incoming skb in such way */ skb = skb_clone(iskb, GFP_ATOMIC); if (!skb) return; j1939_priv_get(priv); can_skb_set_owner(skb, iskb->sk); /* get a pointer to the header of the skb * the skb payload (pointer) is moved, so that the next skb_data * returns the actual payload */ cf = (void *)skb->data; skb_pull(skb, J1939_CAN_HDR); /* fix length, set to dlc, with 8 maximum */ skb_trim(skb, min_t(uint8_t, cf->len, 8)); /* set addr */ skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); iskcb = j1939_skb_to_cb(iskb); skcb->tskey = iskcb->tskey; skcb->priority = (cf->can_id >> 26) & 0x7; skcb->addr.sa = cf->can_id; skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; if (!j1939_address_is_valid(skcb->addr.sa)) { netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", __func__); goto done; } if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; /* normalize pgn: strip dst address */ skcb->addr.pgn &= 0x3ff00; } else { /* set broadcast address */ skcb->addr.da = J1939_NO_ADDR; } /* update localflags */ read_lock_bh(&priv->lock); if (j1939_address_is_unicast(skcb->addr.sa) && priv->ents[skcb->addr.sa].nusers) skcb->flags |= J1939_ECU_LOCAL_SRC; if (j1939_address_is_unicast(skcb->addr.da) && priv->ents[skcb->addr.da].nusers) skcb->flags |= J1939_ECU_LOCAL_DST; read_unlock_bh(&priv->lock); /* deliver into the j1939 stack ... */ j1939_ac_recv(priv, skb); if (j1939_tp_recv(priv, skb)) /* this means the transport layer processed the message */ goto done; j1939_simple_recv(priv, skb); j1939_sk_recv(priv, skb); done: j1939_priv_put(priv); kfree_skb(skb); } /* NETDEV MANAGEMENT */ /* values for can_rx_(un)register */ #define J1939_CAN_ID CAN_EFF_FLAG #define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG) static DEFINE_MUTEX(j1939_netdev_lock); static struct j1939_priv *j1939_priv_create(struct net_device *ndev) { struct j1939_priv *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return NULL; rwlock_init(&priv->lock); INIT_LIST_HEAD(&priv->ecus); priv->ndev = ndev; kref_init(&priv->kref); kref_init(&priv->rx_kref); dev_hold(ndev); netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv); return priv; } static inline void j1939_priv_set(struct net_device *ndev, struct j1939_priv *priv) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); can_ml->j1939_priv = priv; } static void __j1939_priv_release(struct kref *kref) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref); struct net_device *ndev = priv->ndev; netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); WARN_ON_ONCE(!list_empty(&priv->active_session_list)); WARN_ON_ONCE(!list_empty(&priv->ecus)); WARN_ON_ONCE(!list_empty(&priv->j1939_socks)); dev_put(ndev); kfree(priv); } void j1939_priv_put(struct j1939_priv *priv) { kref_put(&priv->kref, __j1939_priv_release); } void j1939_priv_get(struct j1939_priv *priv) { kref_get(&priv->kref); } static int j1939_can_rx_register(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; int ret; j1939_priv_get(priv); ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv, "j1939", NULL); if (ret < 0) { j1939_priv_put(priv); return ret; } return 0; } static void j1939_can_rx_unregister(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv); /* The last reference of priv is dropped by the RCU deferred * j1939_sk_sock_destruct() of the last socket, so we can * safely drop this reference here. */ j1939_priv_put(priv); } static void __j1939_rx_release(struct kref *kref) __releases(&j1939_netdev_lock) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, rx_kref); j1939_can_rx_unregister(priv); j1939_ecu_unmap_all(priv); j1939_priv_set(priv->ndev, NULL); mutex_unlock(&j1939_netdev_lock); } /* get pointer to priv without increasing ref counter */ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); return can_ml->j1939_priv; } static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) { struct j1939_priv *priv; lockdep_assert_held(&j1939_netdev_lock); priv = j1939_ndev_to_priv(ndev); if (priv) j1939_priv_get(priv); return priv; } static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev) { struct j1939_priv *priv; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); mutex_unlock(&j1939_netdev_lock); return priv; } struct j1939_priv *j1939_netdev_start(struct net_device *ndev) { struct j1939_priv *priv, *priv_new; int ret; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); mutex_unlock(&j1939_netdev_lock); return priv; } mutex_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) return ERR_PTR(-ENOMEM); j1939_tp_init(priv); rwlock_init(&priv->j1939_socks_lock); INIT_LIST_HEAD(&priv->j1939_socks); mutex_lock(&j1939_netdev_lock); priv_new = j1939_priv_get_by_ndev_locked(ndev); if (priv_new) { /* Someone was faster than us, use their priv and roll * back our's. */ kref_get(&priv_new->rx_kref); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return priv_new; } j1939_priv_set(ndev, priv); ret = j1939_can_rx_register(priv); if (ret < 0) goto out_priv_put; mutex_unlock(&j1939_netdev_lock); return priv; out_priv_put: j1939_priv_set(ndev, NULL); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return ERR_PTR(ret); } void j1939_netdev_stop(struct j1939_priv *priv) { kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); j1939_priv_put(priv); } int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) { int ret, dlc; canid_t canid; struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct can_frame *cf; /* apply sanity checks */ if (j1939_pgn_is_pdu1(skcb->addr.pgn)) skcb->addr.pgn &= J1939_PGN_PDU1_MAX; else skcb->addr.pgn &= J1939_PGN_MAX; if (skcb->priority > 7) skcb->priority = 6; ret = j1939_ac_fixup(priv, skb); if (unlikely(ret)) goto failed; dlc = skb->len; /* re-claim the CAN_HDR from the SKB */ cf = skb_push(skb, J1939_CAN_HDR); /* initialize header structure */ memset(cf, 0, J1939_CAN_HDR); /* make it a full can frame again */ skb_put_zero(skb, 8 - dlc); canid = CAN_EFF_FLAG | (skcb->priority << 26) | (skcb->addr.pgn << 8) | skcb->addr.sa; if (j1939_pgn_is_pdu1(skcb->addr.pgn)) canid |= skcb->addr.da << 8; cf->can_id = canid; cf->len = dlc; return can_send(skb, 1); failed: kfree_skb(skb); return ret; } static int j1939_netdev_notify(struct notifier_block *nb, unsigned long msg, void *data) { struct net_device *ndev = netdev_notifier_info_to_dev(data); struct can_ml_priv *can_ml = can_get_ml_priv(ndev); struct j1939_priv *priv; if (!can_ml) goto notify_done; priv = j1939_priv_get_by_ndev(ndev); if (!priv) goto notify_done; switch (msg) { case NETDEV_DOWN: j1939_cancel_active_session(priv, NULL); j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; } j1939_priv_put(priv); notify_done: return NOTIFY_DONE; } static struct notifier_block j1939_netdev_notifier = { .notifier_call = j1939_netdev_notify, }; /* MODULE interface */ static __init int j1939_module_init(void) { int ret; pr_info("can: SAE J1939\n"); ret = register_netdevice_notifier(&j1939_netdev_notifier); if (ret) goto fail_notifier; ret = can_proto_register(&j1939_can_proto); if (ret < 0) { pr_err("can: registration of j1939 protocol failed\n"); goto fail_sk; } return 0; fail_sk: unregister_netdevice_notifier(&j1939_netdev_notifier); fail_notifier: return ret; } static __exit void j1939_module_exit(void) { can_proto_unregister(&j1939_can_proto); unregister_netdevice_notifier(&j1939_netdev_notifier); } module_init(j1939_module_init); module_exit(j1939_module_exit);
2 2 2 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/output.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/inet_sock.h> #include <net/sock.h> #include "ackvec.h" #include "ccid.h" #include "dccp.h" static inline void dccp_event_ack_sent(struct sock *sk) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* enqueue @skb on sk_send_head for retransmission, return clone to send now */ static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) { skb_set_owner_w(skb, sk); WARN_ON(sk->sk_send_head); sk->sk_send_head = skb; return skb_clone(sk->sk_send_head, gfp_any()); } /* * All SKB's seen here are completely headerless. It is our * job to build the DCCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. */ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (likely(skb != NULL)) { struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); struct dccp_hdr *dh; /* XXX For now we're using only 48 bits sequence numbers */ const u32 dccp_header_size = sizeof(*dh) + sizeof(struct dccp_hdr_ext) + dccp_packet_hdr_len(dcb->dccpd_type); int err, set_ack = 1; u64 ackno = dp->dccps_gsr; /* * Increment GSS here already in case the option code needs it. * Update GSS for real only if option processing below succeeds. */ dcb->dccpd_seq = ADD48(dp->dccps_gss, 1); switch (dcb->dccpd_type) { case DCCP_PKT_DATA: set_ack = 0; fallthrough; case DCCP_PKT_DATAACK: case DCCP_PKT_RESET: break; case DCCP_PKT_REQUEST: set_ack = 0; /* Use ISS on the first (non-retransmitted) Request. */ if (icsk->icsk_retransmits == 0) dcb->dccpd_seq = dp->dccps_iss; fallthrough; case DCCP_PKT_SYNC: case DCCP_PKT_SYNCACK: ackno = dcb->dccpd_ack_seq; fallthrough; default: /* * Set owner/destructor: some skbs are allocated via * alloc_skb (e.g. when retransmission may happen). * Only Data, DataAck, and Reset packets should come * through here with skb->sk set. */ WARN_ON(skb->sk); skb_set_owner_w(skb, sk); break; } if (dccp_insert_options(sk, skb)) { kfree_skb(skb); return -EPROTO; } /* Build DCCP header and checksum it. */ dh = dccp_zeroed_hdr(skb, dccp_header_size); dh->dccph_type = dcb->dccpd_type; dh->dccph_sport = inet->inet_sport; dh->dccph_dport = inet->inet_dport; dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; dh->dccph_ccval = dcb->dccpd_ccval; dh->dccph_cscov = dp->dccps_pcslen; /* XXX For now we're using only 48 bits sequence numbers */ dh->dccph_x = 1; dccp_update_gss(sk, dcb->dccpd_seq); dccp_hdr_set_seq(dh, dp->dccps_gss); if (set_ack) dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); switch (dcb->dccpd_type) { case DCCP_PKT_REQUEST: dccp_hdr_request(skb)->dccph_req_service = dp->dccps_service; /* * Limit Ack window to ISS <= P.ackno <= GSS, so that * only Responses to Requests we sent are considered. */ dp->dccps_awl = dp->dccps_iss; break; case DCCP_PKT_RESET: dccp_hdr_reset(skb)->dccph_reset_code = dcb->dccpd_reset_code; break; } icsk->icsk_af_ops->send_check(sk, skb); if (set_ack) dccp_event_ack_sent(sk); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); return net_xmit_eval(err); } return -ENOBUFS; } /** * dccp_determine_ccmps - Find out about CCID-specific packet-size limits * @dp: socket to find packet size limits of * * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), * since the RX CCID is restricted to feedback packets (Acks), which are small * in comparison with the data traffic. A value of 0 means "no current CCMPS". */ static u32 dccp_determine_ccmps(const struct dccp_sock *dp) { const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) return 0; return tx_ccid->ccid_ops->ccid_ccmps; } unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); u32 ccmps = dccp_determine_ccmps(dp); u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; /* Account for header lengths and IPv4/v6 option overhead */ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* * Leave enough headroom for common DCCP header options. * This only considers options which may appear on DCCP-Data packets, as * per table 3 in RFC 4340, 5.8. When running out of space for other * options (eg. Ack Vector which can take up to 255 bytes), it is better * to schedule a separate Ack. Thus we leave headroom for the following: * - 1 byte for Slow Receiver (11.6) * - 6 bytes for Timestamp (13.1) * - 10 bytes for Timestamp Echo (13.3) * - 8 bytes for NDP count (7.7, when activated) * - 6 bytes for Data Checksum (9.3) * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) */ cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; WRITE_ONCE(dp->dccps_mss_cache, cur_mps); return cur_mps; } EXPORT_SYMBOL_GPL(dccp_sync_mss); void dccp_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } /** * dccp_wait_for_ccid - Await CCID send permission * @sk: socket to wait for * @delay: timeout in jiffies * * This is used by CCIDs which need to delay the send time in process context. */ static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) { DEFINE_WAIT(wait); long remaining; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; release_sock(sk); remaining = schedule_timeout(delay); lock_sock(sk); sk->sk_write_pending--; finish_wait(sk_sleep(sk), &wait); if (signal_pending(current) || sk->sk_err) return -1; return remaining; } /** * dccp_xmit_packet - Send data packet under control of CCID * @sk: socket to send data packet on * * Transmits next-queued payload and informs CCID to account for the packet. */ static void dccp_xmit_packet(struct sock *sk) { int err, len; struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb = dccp_qpolicy_pop(sk); if (unlikely(skb == NULL)) return; len = skb->len; if (sk->sk_state == DCCP_PARTOPEN) { const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; /* * See 8.1.5 - Handshake Completion. * * For robustness we resend Confirm options until the client has * entered OPEN. During the initial feature negotiation, the MPS * is smaller than usual, reduced by the Change/Confirm options. */ if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { DCCP_WARN("Payload too large (%d) for featneg.\n", len); dccp_send_ack(sk); dccp_feat_list_purge(&dp->dccps_featneg); } inet_csk_schedule_ack(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; } else if (dccp_ack_pending(sk)) { DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; } else { DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; } err = dccp_transmit_skb(sk, skb); if (err) dccp_pr_debug("transmit_skb() returned err=%d\n", err); /* * Register this one as sent even if an error occurred. To the remote * end a local packet drop is indistinguishable from network loss, i.e. * any local drop will eventually be reported via receiver feedback. */ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); /* * If the CCID needs to transfer additional header options out-of-band * (e.g. Ack Vectors or feature-negotiation options), it activates this * flag to schedule a Sync. The Sync will automatically incorporate all * currently pending header options, thus clearing the backlog. */ if (dp->dccps_sync_scheduled) dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); } /** * dccp_flush_write_queue - Drain queue at end of connection * @sk: socket to be drained * @time_budget: time allowed to drain the queue * * Since dccp_sendmsg queues packets without waiting for them to be sent, it may * happen that the TX queue is not empty at the end of a connection. We give the * HC-sender CCID a grace period of up to @time_budget jiffies. If this function * returns with a non-empty write queue, it will be purged later. */ void dccp_flush_write_queue(struct sock *sk, long *time_budget) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; long delay, rc; while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); switch (ccid_packet_dequeue_eval(rc)) { case CCID_PACKET_WILL_DEQUEUE_LATER: /* * If the CCID determines when to send, the next sending * time is unknown or the CCID may not even send again * (e.g. remote host crashes or lost Ack packets). */ DCCP_WARN("CCID did not manage to send all packets\n"); return; case CCID_PACKET_DELAY: delay = msecs_to_jiffies(rc); if (delay > *time_budget) return; rc = dccp_wait_for_ccid(sk, delay); if (rc < 0) return; *time_budget -= (delay - rc); /* check again if we can send now */ break; case CCID_PACKET_SEND_AT_ONCE: dccp_xmit_packet(sk); break; case CCID_PACKET_ERR: skb_dequeue(&sk->sk_write_queue); kfree_skb(skb); dccp_pr_debug("packet discarded due to err=%ld\n", rc); } } } void dccp_write_xmit(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; while ((skb = dccp_qpolicy_top(sk))) { int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); switch (ccid_packet_dequeue_eval(rc)) { case CCID_PACKET_WILL_DEQUEUE_LATER: return; case CCID_PACKET_DELAY: sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies + msecs_to_jiffies(rc)); return; case CCID_PACKET_SEND_AT_ONCE: dccp_xmit_packet(sk); break; case CCID_PACKET_ERR: dccp_qpolicy_drop(sk, skb); dccp_pr_debug("packet discarded due to err=%d\n", rc); } } } /** * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets * @sk: socket to perform retransmit on * * There are only four retransmittable packet types in DCCP: * - Request in client-REQUEST state (sec. 8.1.1), * - CloseReq in server-CLOSEREQ state (sec. 8.3), * - Close in node-CLOSING state (sec. 8.3), * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()). * This function expects sk->sk_send_head to contain the original skb. */ int dccp_retransmit_skb(struct sock *sk) { WARN_ON(sk->sk_send_head == NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) return -EHOSTUNREACH; /* Routing failure or similar. */ /* this count is used to distinguish original and retransmitted skb */ inet_csk(sk)->icsk_retransmits++; return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); } struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, struct request_sock *req) { struct dccp_hdr *dh; struct dccp_request_sock *dreq; const u32 dccp_header_size = sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext) + sizeof(struct dccp_hdr_response); struct sk_buff *skb; /* sk is marked const to clearly express we dont hold socket lock. * sock_wmalloc() will atomically change sk->sk_wmem_alloc, * it is safe to promote sk to non const. */ skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, MAX_DCCP_HEADER); skb_dst_set(skb, dst_clone(dst)); dreq = dccp_rsk(req); if (inet_rsk(req)->acked) /* increase GSS upon retransmission */ dccp_inc_seqno(&dreq->dreq_gss); DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss; /* Resolve feature dependencies resulting from choice of CCID */ if (dccp_feat_server_ccid_dependencies(dreq)) goto response_failed; if (dccp_insert_options_rsk(dreq, skb)) goto response_failed; /* Build and checksum header */ dh = dccp_zeroed_hdr(skb, dccp_header_size); dh->dccph_sport = htons(inet_rsk(req)->ir_num); dh->dccph_dport = inet_rsk(req)->ir_rmt_port; dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; dh->dccph_type = DCCP_PKT_RESPONSE; dh->dccph_x = 1; dccp_hdr_set_seq(dh, dreq->dreq_gss); dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr); dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service; dccp_csum_outgoing(skb); /* We use `acked' to remember that a Response was already sent. */ inet_rsk(req)->acked = 1; DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; response_failed: kfree_skb(skb); return NULL; } EXPORT_SYMBOL_GPL(dccp_make_response); /* answer offending packet in @rcv_skb with Reset from control socket @ctl */ struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb) { struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh; struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb); const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext) + sizeof(struct dccp_hdr_reset); struct dccp_hdr_reset *dhr; struct sk_buff *skb; skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); if (skb == NULL) return NULL; skb_reserve(skb, sk->sk_prot->max_header); /* Swap the send and the receive. */ dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len); dh->dccph_type = DCCP_PKT_RESET; dh->dccph_sport = rxdh->dccph_dport; dh->dccph_dport = rxdh->dccph_sport; dh->dccph_doff = dccp_hdr_reset_len / 4; dh->dccph_x = 1; dhr = dccp_hdr_reset(skb); dhr->dccph_reset_code = dcb->dccpd_reset_code; switch (dcb->dccpd_reset_code) { case DCCP_RESET_CODE_PACKET_ERROR: dhr->dccph_reset_data[0] = rxdh->dccph_type; break; case DCCP_RESET_CODE_OPTION_ERROR: case DCCP_RESET_CODE_MANDATORY_ERROR: memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3); break; } /* * From RFC 4340, 8.3.1: * If P.ackno exists, set R.seqno := P.ackno + 1. * Else set R.seqno := 0. */ if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1)); dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq); dccp_csum_outgoing(skb); return skb; } EXPORT_SYMBOL_GPL(dccp_ctl_make_reset); /* send Reset on established socket, to close or abort the connection */ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) { struct sk_buff *skb; /* * FIXME: what if rebuild_header fails? * Should we be doing a rebuild_header here? */ int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); if (err != 0) return err; skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC); if (skb == NULL) return -ENOBUFS; /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, sk->sk_prot->max_header); DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; DCCP_SKB_CB(skb)->dccpd_reset_code = code; return dccp_transmit_skb(sk, skb); } /* * Do all connect socket setups that can be done AF independent. */ int dccp_connect(struct sock *sk) { struct sk_buff *skb; struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); sk->sk_err = 0; sock_reset_flag(sk, SOCK_DONE); dccp_sync_mss(sk, dst_mtu(dst)); /* do not connect if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dccp_sk(sk))) return -EPROTO; /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ dp->dccps_gar = dp->dccps_iss; skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); if (unlikely(skb == NULL)) return -ENOBUFS; /* Reserve space for headers. */ skb_reserve(skb, sk->sk_prot->max_header); DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ icsk->icsk_retransmits = 0; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); return 0; } EXPORT_SYMBOL_GPL(dccp_connect); void dccp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ if (sk->sk_state != DCCP_CLOSED) { struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); if (skb == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, DCCP_RTO_MAX); return; } /* Reserve space for headers */ skb_reserve(skb, sk->sk_prot->max_header); DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; dccp_transmit_skb(sk, skb); } } EXPORT_SYMBOL_GPL(dccp_send_ack); #if 0 /* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */ void dccp_send_delayed_ack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* * FIXME: tune this timer. elapsed time fixes the skew, so no problem * with using 2s, and active senders also piggyback the ACK into a * DATAACK packet, so this is really for quiescent senders. */ unsigned long timeout = jiffies + 2 * HZ; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer was blocked or is about to expire, * send ACK now. * * FIXME: check the "about to expire" part */ if (icsk->icsk_ack.blocked) { dccp_send_ack(sk); return; } if (!time_before(timeout, icsk->icsk_ack.timeout)) timeout = icsk->icsk_ack.timeout; } icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } #endif void dccp_send_sync(struct sock *sk, const u64 ackno, const enum dccp_pkt_type pkt_type) { /* * We are not putting this on the write queue, so * dccp_transmit_skb() will set the ownership to this * sock. */ struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); if (skb == NULL) { /* FIXME: how to make sure the sync is sent? */ DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type)); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, sk->sk_prot->max_header); DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; /* * Clear the flag in case the Sync was scheduled for out-of-band data, * such as carrying a long Ack Vector. */ dccp_sk(sk)->dccps_sync_scheduled = 0; dccp_transmit_skb(sk, skb); } EXPORT_SYMBOL_GPL(dccp_send_sync); /* * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under * any circumstances. */ void dccp_send_close(struct sock *sk, const int active) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; skb = alloc_skb(sk->sk_prot->max_header, prio); if (skb == NULL) return; /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, sk->sk_prot->max_header); if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; else DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { skb = dccp_skb_entail(sk, skb); /* * Retransmission timer for active-close: RFC 4340, 8.3 requires * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ * state can be left. The initial timeout is 2 RTTs. * Since RTT measurement is done by the CCIDs, there is no easy * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 * is too low (200ms); we use a high value to avoid unnecessary * retransmissions when the link RTT is > 0.2 seconds. * FIXME: Let main module sample RTTs and use that instead. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); } dccp_transmit_skb(sk, skb); }
62 63 222 3 2026 82 85 8 67 45 1529 11 2018 2028 1331 2414 2889 158 504 3 47 239 187 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> struct kernel_clone_args; #ifdef CONFIG_CGROUPS /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 /* walk only threadgroup leaders */ #define CSS_TASK_ITER_PROCS (1U << 0) /* walk all threaded css_sets in the domain */ #define CSS_TASK_ITER_THREADED (1U << 1) /* internal flags */ #define CSS_TASK_ITER_SKIPPED (1U << 16) /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ static inline void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ static inline bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); } /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ static inline void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (cgrp->level < ancestor_level) return NULL; while (cgrp && cgrp->level > ancestor_level) cgrp = cgroup_parent(cgrp); return cgrp; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { return &cgrp->psi; } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. */ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} static inline struct cgroup *cgroup_get_from_id(u64 id) { return NULL; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); rcu_read_lock(); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); rcu_read_unlock(); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; struct css_set *root_cset; }; extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; } #endif /* !CONFIG_CGROUPS */ static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) free_cgroup_ns(ns); } #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ #endif /* _LINUX_CGROUP_H */
43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 // SPDX-License-Identifier: GPL-2.0-or-later /* AFS client file system * * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/sched.h> #include <linux/random.h> #include <linux/proc_fs.h> #define CREATE_TRACE_POINTS #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); unsigned afs_debug; module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(debug, "AFS debugging mask"); static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct workqueue_struct *afs_wq; static struct proc_dir_entry *afs_proc_symlink; #if defined(CONFIG_ALPHA) const char afs_init_sysname[] = "alpha_linux26"; #elif defined(CONFIG_X86_64) const char afs_init_sysname[] = "amd64_linux26"; #elif defined(CONFIG_ARM) const char afs_init_sysname[] = "arm_linux26"; #elif defined(CONFIG_ARM64) const char afs_init_sysname[] = "aarch64_linux26"; #elif defined(CONFIG_X86_32) const char afs_init_sysname[] = "i386_linux26"; #elif defined(CONFIG_IA64) const char afs_init_sysname[] = "ia64_linux26"; #elif defined(CONFIG_PPC64) const char afs_init_sysname[] = "ppc64_linux26"; #elif defined(CONFIG_PPC32) const char afs_init_sysname[] = "ppc_linux26"; #elif defined(CONFIG_S390) #ifdef CONFIG_64BIT const char afs_init_sysname[] = "s390x_linux26"; #else const char afs_init_sysname[] = "s390_linux26"; #endif #elif defined(CONFIG_SPARC64) const char afs_init_sysname[] = "sparc64_linux26"; #elif defined(CONFIG_SPARC32) const char afs_init_sysname[] = "sparc_linux26"; #else const char afs_init_sysname[] = "unknown_linux26"; #endif /* * Initialise an AFS network namespace record. */ static int __net_init afs_net_init(struct net *net_ns) { struct afs_sysnames *sysnames; struct afs_net *net = afs_net(net_ns); int ret; net->net = net_ns; net->live = true; generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; init_rwsem(&net->cells_lock); INIT_WORK(&net->cells_manager, afs_manage_cells); timer_setup(&net->cells_timer, afs_cells_timer, 0); mutex_init(&net->cells_alias_lock); mutex_init(&net->proc_cells_lock); INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); net->fs_servers = RB_ROOT; INIT_LIST_HEAD(&net->fs_probe_fast); INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); INIT_HLIST_HEAD(&net->fs_addresses4); INIT_HLIST_HEAD(&net->fs_addresses6); seqlock_init(&net->fs_addr_lock); INIT_WORK(&net->fs_manager, afs_manage_servers); timer_setup(&net->fs_timer, afs_servers_timer, 0); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); atomic_set(&net->servers_outstanding, 1); ret = -ENOMEM; sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); if (!sysnames) goto error_sysnames; sysnames->subs[0] = (char *)&afs_init_sysname; sysnames->nr = 1; refcount_set(&sysnames->usage, 1); net->sysnames = sysnames; rwlock_init(&net->sysnames_lock); /* Register the /proc stuff */ ret = afs_proc_init(net); if (ret < 0) goto error_proc; /* Initialise the cell DB */ ret = afs_cell_init(net, rootcell); if (ret < 0) goto error_cell_init; /* Create the RxRPC transport */ ret = afs_open_socket(net); if (ret < 0) goto error_open_socket; return 0; error_open_socket: net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_purge_servers(net); error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: afs_put_sysnames(net->sysnames); error_sysnames: net->live = false; return ret; } /* * Clean up and destroy an AFS network namespace record. */ static void __net_exit afs_net_exit(struct net *net_ns) { struct afs_net *net = afs_net(net_ns); net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_purge_servers(net); afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); } static struct pernet_operations afs_net_ops = { .init = afs_net_init, .exit = afs_net_exit, .id = &afs_net_id, .size = sizeof(struct afs_net), }; /* * initialise the AFS client FS module */ static int __init afs_init(void) { int ret = -ENOMEM; printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); afs_wq = alloc_workqueue("afs", 0, 0); if (!afs_wq) goto error_afs_wq; afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); if (!afs_async_calls) goto error_async; afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); if (!afs_lock_manager) goto error_lockmgr; #ifdef CONFIG_AFS_FSCACHE /* we want to be able to cache */ ret = fscache_register_netfs(&afs_cache_netfs); if (ret < 0) goto error_cache; #endif ret = register_pernet_device(&afs_net_ops); if (ret < 0) goto error_net; /* register the filesystems */ ret = afs_fs_init(); if (ret < 0) goto error_fs; afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs"); if (!afs_proc_symlink) { ret = -ENOMEM; goto error_proc; } return ret; error_proc: afs_fs_exit(); error_fs: unregister_pernet_device(&afs_net_ops); error_net: #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif destroy_workqueue(afs_lock_manager); error_lockmgr: destroy_workqueue(afs_async_calls); error_async: destroy_workqueue(afs_wq); error_afs_wq: rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); return ret; } /* XXX late_initcall is kludgy, but the only alternative seems to create * a transport upon the first mount, which is worse. Or is it? */ late_initcall(afs_init); /* must be called after net/ to create socket */ /* * clean up on module removal */ static void __exit afs_exit(void) { printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); proc_remove(afs_proc_symlink); afs_fs_exit(); unregister_pernet_device(&afs_net_ops); #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); #endif destroy_workqueue(afs_lock_manager); destroy_workqueue(afs_async_calls); destroy_workqueue(afs_wq); afs_clean_up_permit_cache(); rcu_barrier(); } module_exit(afs_exit);
9 6 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 #include <linux/errno.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ip.h> #include <net/ip6_fib.h> #include <net/lwtunnel.h> #include <net/protocol.h> #include <uapi/linux/ila.h> #include "ila.h" void ila_init_saved_csum(struct ila_params *p) { if (!p->locator_match.v64) return; p->csum_diff = compute_csum_diff8( (__be32 *)&p->locator, (__be32 *)&p->locator_match); } static __wsum get_csum_diff_iaddr(struct ila_addr *iaddr, struct ila_params *p) { if (p->locator_match.v64) return p->csum_diff; else return compute_csum_diff8((__be32 *)&p->locator, (__be32 *)&iaddr->loc); } static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) { return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); } static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr, struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; diff = get_csum_diff_iaddr(iaddr, p); fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); diff = csum_add(diff, fval); *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); /* Flip the csum-neutral bit. Either we are doing a SIR->ILA * translation with ILA_CSUM_NEUTRAL_MAP as the csum_method * and the C-bit is not set, or we are doing an ILA-SIR * tranlsation and the C-bit is set. */ iaddr->ident.csum_neutral ^= 1; } static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr, struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff; diff = get_csum_diff_iaddr(iaddr, p); *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); } static void ila_csum_adjust_transport(struct sk_buff *skb, struct ila_params *p) { size_t nhoff = sizeof(struct ipv6hdr); struct ipv6hdr *ip6h = ipv6_hdr(skb); __wsum diff; switch (ip6h->nexthdr) { case NEXTHDR_TCP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { struct tcphdr *th = (struct tcphdr *) (skb_network_header(skb) + nhoff); diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&th->check, skb, diff, true); } break; case NEXTHDR_UDP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) { struct udphdr *uh = (struct udphdr *) (skb_network_header(skb) + nhoff); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&uh->check, skb, diff, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } } break; case NEXTHDR_ICMP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct icmp6hdr)))) { struct icmp6hdr *ih = (struct icmp6hdr *) (skb_network_header(skb) + nhoff); diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, diff, true); } break; } } void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, bool sir2ila) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); switch (p->csum_mode) { case ILA_CSUM_ADJUST_TRANSPORT: ila_csum_adjust_transport(skb, p); break; case ILA_CSUM_NEUTRAL_MAP: if (sir2ila) { if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) { /* Checksum flag should never be * set in a formatted SIR address. */ break; } } else if (!ila_csum_neutral_set(iaddr->ident)) { /* ILA to SIR translation and C-bit isn't * set so we're good. */ break; } ila_csum_do_neutral_fmt(iaddr, p); break; case ILA_CSUM_NEUTRAL_MAP_AUTO: ila_csum_do_neutral_nofmt(iaddr, p); break; case ILA_CSUM_NO_ACTION: break; } /* Now change destination address */ iaddr->loc = p->locator; }
69 69 22 22 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for logging packets to userspace via * nfetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ipt_ULOG.c: * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/netfilter_bridge.h> #include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/spinlock.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/list.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> #include <net/netns/generic.h> #include <linux/atomic.h> #include <linux/refcount.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ /* max packet size is limited by 16-bit struct nfattr nfa_len field */ #define NFULNL_COPY_RANGE_MAX (0xFFFF - NLA_HDRLEN) #define PRINTR(x, args...) do { if (net_ratelimit()) \ printk(x, ## args); } while (0); struct nfulnl_instance { struct hlist_node hlist; /* global list of instances */ spinlock_t lock; refcount_t use; /* use count */ unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocatd skb */ struct timer_list timer; struct net *net; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ u32 peer_portid; /* PORTID of the peer process */ /* configurable parameters */ unsigned int flushtimeout; /* timeout until queue flush */ unsigned int nlbufsiz; /* netlink buffer allocation size */ unsigned int qthreshold; /* threshold of the queue */ u_int32_t copy_range; u_int32_t seq; /* instance-local sequential counter */ u_int16_t group_num; /* number of this queue */ u_int16_t flags; u_int8_t copy_mode; struct rcu_head rcu; }; #define INSTANCE_BUCKETS 16 static unsigned int nfnl_log_net_id __read_mostly; struct nfnl_log_net { spinlock_t instances_lock; struct hlist_head instance_table[INSTANCE_BUCKETS]; atomic_t global_seq; }; static struct nfnl_log_net *nfnl_log_pernet(struct net *net) { return net_generic(net, nfnl_log_net_id); } static inline u_int8_t instance_hashfn(u_int16_t group_num) { return ((group_num & 0xff) % INSTANCE_BUCKETS); } static struct nfulnl_instance * __instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) { struct hlist_head *head; struct nfulnl_instance *inst; head = &log->instance_table[instance_hashfn(group_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->group_num == group_num) return inst; } return NULL; } static inline void instance_get(struct nfulnl_instance *inst) { refcount_inc(&inst->use); } static struct nfulnl_instance * instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) { struct nfulnl_instance *inst; rcu_read_lock_bh(); inst = __instance_lookup(log, group_num); if (inst && !refcount_inc_not_zero(&inst->use)) inst = NULL; rcu_read_unlock_bh(); return inst; } static void nfulnl_instance_free_rcu(struct rcu_head *head) { struct nfulnl_instance *inst = container_of(head, struct nfulnl_instance, rcu); put_net(inst->net); kfree(inst); module_put(THIS_MODULE); } static void instance_put(struct nfulnl_instance *inst) { if (inst && refcount_dec_and_test(&inst->use)) call_rcu(&inst->rcu, nfulnl_instance_free_rcu); } static void nfulnl_timer(struct timer_list *t); static struct nfulnl_instance * instance_create(struct net *net, u_int16_t group_num, u32 portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; struct nfnl_log_net *log = nfnl_log_pernet(net); int err; spin_lock_bh(&log->instances_lock); if (__instance_lookup(log, group_num)) { err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; } if (!try_module_get(THIS_MODULE)) { kfree(inst); err = -EAGAIN; goto out_unlock; } INIT_HLIST_NODE(&inst->hlist); spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ refcount_set(&inst->use, 2); timer_setup(&inst->timer, nfulnl_timer, 0); inst->net = get_net(net); inst->peer_user_ns = user_ns; inst->peer_portid = portid; inst->group_num = group_num; inst->qthreshold = NFULNL_QTHRESH_DEFAULT; inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; inst->copy_mode = NFULNL_COPY_PACKET; inst->copy_range = NFULNL_COPY_RANGE_MAX; hlist_add_head_rcu(&inst->hlist, &log->instance_table[instance_hashfn(group_num)]); spin_unlock_bh(&log->instances_lock); return inst; out_unlock: spin_unlock_bh(&log->instances_lock); return ERR_PTR(err); } static void __nfulnl_flush(struct nfulnl_instance *inst); /* called with BH disabled */ static void __instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ hlist_del_rcu(&inst->hlist); /* then flush all pending packets from skb */ spin_lock(&inst->lock); /* lockless readers wont be able to use us */ inst->copy_mode = NFULNL_COPY_DISABLED; if (inst->skb) __nfulnl_flush(inst); spin_unlock(&inst->lock); /* and finally put the refcount */ instance_put(inst); } static inline void instance_destroy(struct nfnl_log_net *log, struct nfulnl_instance *inst) { spin_lock_bh(&log->instances_lock); __instance_destroy(inst); spin_unlock_bh(&log->instances_lock); } static int nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, unsigned int range) { int status = 0; spin_lock_bh(&inst->lock); switch (mode) { case NFULNL_COPY_NONE: case NFULNL_COPY_META: inst->copy_mode = mode; inst->copy_range = 0; break; case NFULNL_COPY_PACKET: inst->copy_mode = mode; if (range == 0) range = NFULNL_COPY_RANGE_MAX; inst->copy_range = min_t(unsigned int, range, NFULNL_COPY_RANGE_MAX); break; default: status = -EINVAL; break; } spin_unlock_bh(&inst->lock); return status; } static int nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) { int status; spin_lock_bh(&inst->lock); if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) status = -ERANGE; else if (nlbufsiz > 131072) status = -ERANGE; else { inst->nlbufsiz = nlbufsiz; status = 0; } spin_unlock_bh(&inst->lock); return status; } static void nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) { spin_lock_bh(&inst->lock); inst->flushtimeout = timeout; spin_unlock_bh(&inst->lock); } static void nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) { spin_lock_bh(&inst->lock); inst->qthreshold = qthresh; spin_unlock_bh(&inst->lock); } static int nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) { spin_lock_bh(&inst->lock); inst->flags = flags; spin_unlock_bh(&inst->lock); return 0; } static struct sk_buff * nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; /* alloc skb which should be big enough for a whole multipart * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ skb = alloc_skb(pkt_size, GFP_ATOMIC); } } return skb; } static void __nfulnl_send(struct nfulnl_instance *inst) { if (inst->qlen > 1) { struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0, NLMSG_DONE, sizeof(struct nfgenmsg), 0); if (WARN_ONCE(!nlh, "bad nlskb size: %u, tailroom %d\n", inst->skb->len, skb_tailroom(inst->skb))) { kfree_skb(inst->skb); goto out; } } nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; } static void __nfulnl_flush(struct nfulnl_instance *inst) { /* timer holds a reference */ if (del_timer(&inst->timer)) instance_put(inst); if (inst->skb) __nfulnl_send(inst); } static void nfulnl_timer(struct timer_list *t) { struct nfulnl_instance *inst = from_timer(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) __nfulnl_send(inst); spin_unlock_bh(&inst->lock); instance_put(inst); } static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) { u32 size = 0; if (!skb_mac_header_was_set(skb)) return 0; if (skb_vlan_tag_present(skb)) { size += nla_total_size(0); /* nested */ size += nla_total_size(sizeof(u16)); /* id */ size += nla_total_size(sizeof(u16)); /* tag */ } if (skb->network_header > skb->mac_header) size += nla_total_size(skb->network_header - skb->mac_header); return size; } static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) return 0; if (skb_vlan_tag_present(skb)) { struct nlattr *nest; nest = nla_nest_start(inst->skb, NFULA_VLAN); if (!nest) goto nla_put_failure; if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) || nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto)) goto nla_put_failure; nla_nest_end(inst->skb, nest); } if (skb->mac_header < skb->network_header) { int len = (int)(skb->network_header - skb->mac_header); if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } /* This is an inline function, we don't really care about a long * list of arguments */ static inline int __build_packet_message(struct nfnl_log_net *log, struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, u_int8_t pf, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, const char *prefix, unsigned int plen, const struct nfnl_ct_hook *nfnl_ct, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), 0, pf, NFNETLINK_V0, htons(inst->group_num)); if (!nlh) return -1; memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; pmsg.hook = hooknum; if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg)) goto nla_put_failure; if (prefix && nla_put(inst->skb, NFULA_PREFIX, plen, prefix)) goto nla_put_failure; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_thresh or * nf_log_packet. */ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { struct net_device *physindev; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; physindev = nf_bridge_get_physindev(skb); if (physindev && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(physindev->ifindex))) goto nla_put_failure; } #endif } if (outdev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_thresh or * nf_log_packet. */ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { struct net_device *physoutdev; /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; physoutdev = nf_bridge_get_physoutdev(skb); if (physoutdev && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(physoutdev->ifindex))) goto nla_put_failure; } #endif } if (skb->mark && nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark))) goto nla_put_failure; if (indev && skb->dev && skb_mac_header_was_set(skb) && skb_mac_header_len(skb) != 0) { struct nfulnl_msg_packet_hw phw; int len; memset(&phw, 0, sizeof(phw)); len = dev_parse_header(skb, phw.hw_addr); if (len > 0) { phw.hw_addrlen = htons(len); if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) goto nla_put_failure; } } if (indev && skb_mac_header_was_set(skb)) { if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || nla_put_be16(inst->skb, NFULA_HWLEN, htons(skb->dev->hard_header_len))) goto nla_put_failure; hwhdrp = skb_mac_header(skb); if (skb->dev->type == ARPHRD_SIT) hwhdrp -= ETH_HLEN; if (hwhdrp >= skb->head && nla_put(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, hwhdrp)) goto nla_put_failure; } if (hooknum <= NF_INET_FORWARD && skb->tstamp) { struct nfulnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(skb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; } /* UID */ sk = skb->sk; if (sk && sk_fullsock(sk)) { read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { struct file *file = sk->sk_socket->file; const struct cred *cred = file->f_cred; struct user_namespace *user_ns = inst->peer_user_ns; __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); read_unlock_bh(&sk->sk_callback_lock); if (nla_put_be32(inst->skb, NFULA_UID, uid) || nla_put_be32(inst->skb, NFULA_GID, gid)) goto nla_put_failure; } else read_unlock_bh(&sk->sk_callback_lock); } /* local sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ) && nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++))) goto nla_put_failure; /* global sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, htonl(atomic_inc_return(&log->global_seq)))) goto nla_put_failure; if (ct && nfnl_ct->build(inst->skb, ct, ctinfo, NFULA_CT, NFULA_CT_INFO) < 0) goto nla_put_failure; if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) && nfulnl_put_bridge(inst, skb) < 0) goto nla_put_failure; if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); if (skb_tailroom(inst->skb) < nla_total_size(data_len)) goto nla_put_failure; nla = skb_put(inst->skb, nla_total_size(data_len)); nla->nla_type = NFULA_PAYLOAD; nla->nla_len = size; if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) BUG(); } nlh->nlmsg_len = inst->skb->tail - old_tail; return 0; nla_put_failure: PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); return -1; } static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { .ulog = { .copy_len = 0xffff, .group = 0, .qthreshold = 1, }, }, }; /* log handler for internal netfilter logging api */ static void nfulnl_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li_user, const char *prefix) { size_t size; unsigned int data_len; struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; enum ip_conntrack_info ctinfo = 0; struct nf_conn *ct = NULL; if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; inst = instance_lookup_get(log, li->u.ulog.group); if (!inst) return; if (prefix) plen = strlen(prefix) + 1; /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... */ size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t)) /* gid */ + nla_total_size(plen) /* prefix */ + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)) + nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */ if (in && skb_mac_header_was_set(skb)) { size += nla_total_size(skb->dev->hard_header_len) + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + nla_total_size(sizeof(u_int16_t)); /* hwlen */ } spin_lock_bh(&inst->lock); if (inst->flags & NFULNL_CFG_F_SEQ) size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (inst->flags & NFULNL_CFG_F_CONNTRACK) { nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) { ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } #endif if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) size += nfulnl_get_bridge_size(skb); qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ if (li->u.ulog.qthreshold) if (qthreshold > li->u.ulog.qthreshold) qthreshold = li->u.ulog.qthreshold; switch (inst->copy_mode) { case NFULNL_COPY_META: case NFULNL_COPY_NONE: data_len = 0; break; case NFULNL_COPY_PACKET: data_len = inst->copy_range; if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) && (li->u.ulog.copy_len < data_len)) data_len = li->u.ulog.copy_len; if (data_len > skb->len) data_len = skb->len; size += nla_total_size(data_len); break; case NFULNL_COPY_DISABLED: default: goto unlock_and_release; } if (inst->skb && size > skb_tailroom(inst->skb)) { /* either the queue len is too high or we don't have * enough room in the skb left. flush to userspace. */ __nfulnl_flush(inst); } if (!inst->skb) { inst->skb = nfulnl_alloc_skb(net, inst->peer_portid, inst->nlbufsiz, size); if (!inst->skb) goto alloc_failure; } inst->qlen++; __build_packet_message(log, inst, skb, data_len, pf, hooknum, in, out, prefix, plen, nfnl_ct, ct, ctinfo); if (inst->qlen >= qthreshold) __nfulnl_flush(inst); /* timer_pending always called within inst->lock, so there * is no chance of a race here */ else if (!timer_pending(&inst->timer)) { instance_get(inst); inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); add_timer(&inst->timer); } unlock_and_release: spin_unlock_bh(&inst->lock); instance_put(inst); return; alloc_failure: /* FIXME: statistics */ goto unlock_and_release; } static int nfulnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct nfnl_log_net *log = nfnl_log_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ spin_lock_bh(&log->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfulnl_instance *inst; struct hlist_head *head = &log->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { if (n->portid == inst->peer_portid) __instance_destroy(inst); } } spin_unlock_bh(&log->instances_lock); } return NOTIFY_DONE; } static struct notifier_block nfulnl_rtnl_notifier = { .notifier_call = nfulnl_rcv_nl_event, }; static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { return -ENOTSUPP; } static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", .type = NF_LOG_TYPE_ULOG, .logfn = nfulnl_log_packet, .me = THIS_MODULE, }; static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) }, [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) }, [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 }, [NFULA_CFG_QTHRESH] = { .type = NLA_U32 }, [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 }, [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { struct nfnl_log_net *log = nfnl_log_pernet(info->net); u_int16_t group_num = ntohs(info->nfmsg->res_id); struct nfulnl_msg_config_cmd *cmd = NULL; struct nfulnl_instance *inst; u16 flags = 0; int ret = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = info->nfmsg->nfgen_family; cmd = nla_data(nfula[NFULA_CFG_CMD]); /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: return nf_log_bind_pf(info->net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: nf_log_unbind_pf(info->net, pf); return 0; } } inst = instance_lookup_get(log, group_num); if (inst && inst->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto out_put; } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. */ if (nfula[NFULA_CFG_FLAGS]) { flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS])); if ((flags & NFULNL_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_ULOG); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_ULOG); if (rcu_access_pointer(nfnl_ct_hook)) { ret = -EAGAIN; goto out_put; } #endif ret = -EOPNOTSUPP; goto out_put; } } if (cmd != NULL) { switch (cmd->command) { case NFULNL_CFG_CMD_BIND: if (inst) { ret = -EBUSY; goto out_put; } inst = instance_create(info->net, group_num, NETLINK_CB(skb).portid, sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { ret = PTR_ERR(inst); goto out; } break; case NFULNL_CFG_CMD_UNBIND: if (!inst) { ret = -ENODEV; goto out; } instance_destroy(log, inst); goto out_put; default: ret = -ENOTSUPP; goto out_put; } } else if (!inst) { ret = -ENODEV; goto out; } if (nfula[NFULA_CFG_MODE]) { struct nfulnl_msg_config_mode *params = nla_data(nfula[NFULA_CFG_MODE]); nfulnl_set_mode(inst, params->copy_mode, ntohl(params->copy_range)); } if (nfula[NFULA_CFG_TIMEOUT]) { __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); nfulnl_set_timeout(inst, ntohl(timeout)); } if (nfula[NFULA_CFG_NLBUFSIZ]) { __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); } if (nfula[NFULA_CFG_QTHRESH]) { __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); nfulnl_set_qthresh(inst, ntohl(qthresh)); } if (nfula[NFULA_CFG_FLAGS]) nfulnl_set_flags(inst, flags); out_put: instance_put(inst); out: return ret; } static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, .type = NFNL_CB_MUTEX, .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFULA_CFG_MAX, .policy = nfula_cfg_policy }, }; static const struct nfnetlink_subsystem nfulnl_subsys = { .name = "log", .subsys_id = NFNL_SUBSYS_ULOG, .cb_count = NFULNL_MSG_MAX, .cb = nfulnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct net *net, struct iter_state *st) { struct nfnl_log_net *log; if (!st) return NULL; log = nfnl_log_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { struct hlist_head *head = &log->instance_table[st->bucket]; if (!hlist_empty(head)) return rcu_dereference_bh(hlist_first_rcu(head)); } return NULL; } static struct hlist_node *get_next(struct net *net, struct iter_state *st, struct hlist_node *h) { h = rcu_dereference_bh(hlist_next_rcu(h)); while (!h) { struct nfnl_log_net *log; struct hlist_head *head; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; log = nfnl_log_pernet(net); head = &log->instance_table[st->bucket]; h = rcu_dereference_bh(hlist_first_rcu(head)); } return h; } static struct hlist_node *get_idx(struct net *net, struct iter_state *st, loff_t pos) { struct hlist_node *head; head = get_first(net, st); if (head) while (pos && (head = get_next(net, st, head))) pos--; return pos ? NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(rcu_bh) { rcu_read_lock_bh(); return get_idx(seq_file_net(s), s->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(seq_file_net(s), s->private, v); } static void seq_stop(struct seq_file *s, void *v) __releases(rcu_bh) { rcu_read_unlock_bh(); } static int seq_show(struct seq_file *s, void *v) { const struct nfulnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %6u %2u\n", inst->group_num, inst->peer_portid, inst->qlen, inst->copy_mode, inst->copy_range, inst->flushtimeout, refcount_read(&inst->use)); return 0; } static const struct seq_operations nful_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_log_net_init(struct net *net) { unsigned int i; struct nfnl_log_net *log = nfnl_log_pernet(net); #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; #endif for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&log->instance_table[i]); spin_lock_init(&log->instances_lock); #ifdef CONFIG_PROC_FS proc = proc_create_net("nfnetlink_log", 0440, net->nf.proc_netfilter, &nful_seq_ops, sizeof(struct iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif return 0; } static void __net_exit nfnl_log_net_exit(struct net *net) { struct nfnl_log_net *log = nfnl_log_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif nf_log_unset(net, &nfulnl_logger); for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&log->instance_table[i])); } static struct pernet_operations nfnl_log_net_ops = { .init = nfnl_log_net_init, .exit = nfnl_log_net_exit, .id = &nfnl_log_net_id, .size = sizeof(struct nfnl_log_net), }; static int __init nfnetlink_log_init(void) { int status; status = register_pernet_subsys(&nfnl_log_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); if (status < 0) { pr_err("failed to register logger\n"); goto cleanup_subsys; } return status; cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_log_net_ops); out: return status; } static void __exit nfnetlink_log_fini(void) { nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); } MODULE_DESCRIPTION("netfilter userspace logging"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); MODULE_ALIAS_NF_LOGGER(AF_INET, 1); MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */ MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */ module_init(nfnetlink_log_init); module_exit(nfnetlink_log_fini);
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */
868 869 867 812 815 5 5 57 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 // SPDX-License-Identifier: GPL-2.0 /* * Performance events ring-buffer code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/circ_buf.h> #include <linux/poll.h> #include <linux/nospec.h> #include "internal.h" static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending_irq); } /* * We need to ensure a later event_id doesn't publish a head when a former * event isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * * We only publish the head (and generate a wakeup) when the outer-most * event completes. */ static void perf_output_get_handle(struct perf_output_handle *handle) { struct perf_buffer *rb = handle->rb; preempt_disable(); /* * Avoid an explicit LOAD/STORE such that architectures with memops * can use them. */ (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { struct perf_buffer *rb = handle->rb; unsigned long head; unsigned int nest; /* * If this isn't the outermost nesting, we don't have to update * @rb->user_page->data_head. */ nest = READ_ONCE(rb->nest); if (nest > 1) { WRITE_ONCE(rb->nest, nest - 1); goto out; } again: /* * In order to avoid publishing a head value that goes backwards, * we must ensure the load of @rb->head happens after we've * incremented @rb->nest. * * Otherwise we can observe a @rb->head value before one published * by an IRQ/NMI happening between the load and the increment. */ barrier(); head = local_read(&rb->head); /* * IRQ/NMI can happen here and advance @rb->head, causing our * load above to be stale. */ /* * Since the mmap() consumer (userspace) can run on a different CPU: * * kernel user * * if (LOAD ->data_tail) { LOAD ->data_head * (A) smp_rmb() (C) * STORE $data LOAD $data * smp_wmb() (B) smp_mb() (D) * STORE ->data_head STORE ->data_tail * } * * Where A pairs with D, and B pairs with C. * * In our case (A) is a control dependency that separates the load of * the ->data_tail and the stores of $data. In case ->data_tail * indicates there is no room in the buffer to store $data we do not. * * D needs to be a full barrier since it separates the data READ * from the tail WRITE. * * For B a WMB is sufficient since it separates two WRITEs, and for C * an RMB is sufficient since it separates two READs. * * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, * otherwise an IRQ/NMI can publish a more recent head value and our * write will (temporarily) publish a stale value. */ barrier(); WRITE_ONCE(rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. * Otherwise we cannot be sure we caught the 'last' nested update. */ barrier(); if (unlikely(head != local_read(&rb->head))) { WRITE_ONCE(rb->nest, 1); goto again; } if (handle->wakeup != local_read(&rb->wakeup)) perf_output_wakeup(handle); out: preempt_enable(); } static __always_inline bool ring_buffer_has_space(unsigned long head, unsigned long tail, unsigned long data_size, unsigned int size, bool backward) { if (!backward) return CIRC_SPACE(head, tail, data_size) >= size; else return CIRC_SPACE(tail, head, data_size) >= size; } static __always_inline int __perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { struct perf_buffer *rb; unsigned long tail, offset, head; int have_lost, page_shift; struct { struct perf_event_header header; u64 id; u64 lost; } lost_event; rcu_read_lock(); /* * For inherited events we send all the output towards the parent. */ if (event->parent) event = event->parent; rb = rcu_dereference(event->rb); if (unlikely(!rb)) goto out; if (unlikely(rb->paused)) { if (rb->nr_pages) { local_inc(&rb->lost); atomic64_inc(&event->lost_samples); } goto out; } handle->rb = rb; handle->event = event; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { size += sizeof(lost_event); if (event->attr.sample_id_all) size += event->id_header_size; } perf_output_get_handle(handle); do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); if (!rb->overwrite) { if (unlikely(!ring_buffer_has_space(head, tail, perf_data_size(rb), size, backward))) goto fail; } /* * The above forms a control dependency barrier separating the * @tail load above from the data stores below. Since the @tail * load is required to compute the branch to fail below. * * A, matches D; the full memory barrier userspace SHOULD issue * after reading the data and before storing the new tail * position. * * See perf_output_put_handle(). */ if (!backward) head += size; else head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); if (backward) { offset = head; head = (u64)(-head); } /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. */ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); page_shift = PAGE_SHIFT + page_order(rb); handle->page = (offset >> page_shift) & (rb->nr_pages - 1); offset &= (1UL << page_shift) - 1; handle->addr = rb->data_pages[handle->page] + offset; handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); /* XXX mostly redundant; @data is already fully initializes */ perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); perf_event__output_id_sample(event, handle, data); } return 0; fail: local_inc(&rb->lost); atomic64_inc(&event->lost_samples); perf_output_put_handle(handle); out: rcu_read_unlock(); return -ENOSPC; } int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, data, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { return __output_copy(handle, buf, len); } unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len) { return __output_skip(handle, NULL, len); } void perf_output_end(struct perf_output_handle *handle) { perf_output_put_handle(handle); rcu_read_unlock(); } static void ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) { long max_size = perf_data_size(rb); if (watermark) rb->watermark = min(max_size, watermark); if (!rb->watermark) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) rb->overwrite = 0; else rb->overwrite = 1; refcount_set(&rb->refcount, 1); INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); /* * perf_output_begin() only checks rb->paused, therefore * rb->paused must be true if we have no pages for output. */ if (!rb->nr_pages) rb->paused = 1; mutex_init(&rb->aux_mutex); } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) { /* * OVERWRITE is determined by perf_aux_output_end() and can't * be passed in directly. */ if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) return; handle->aux_flags |= flags; } EXPORT_SYMBOL_GPL(perf_aux_output_flag); /* * This is called before hardware starts writing to the AUX area to * obtain an output handle and make sure there's room in the buffer. * When the capture completes, call perf_aux_output_end() to commit * the recorded data to the buffer. * * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. * * Call this from pmu::start(); see the comment in perf_aux_output_end() * about its use in pmu callbacks. Both can also be called from the PMI * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct perf_buffer *rb; unsigned int nest; if (output_event->parent) output_event = output_event->parent; /* * Since this will typically be open across pmu::add/pmu::del, we * grab ring_buffer's refcount instead of holding rcu read lock * to make sure it doesn't disappear under us. */ rb = ring_buffer_get(output_event); if (!rb) return NULL; if (!rb_has_aux(rb)) goto err; /* * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), * about to get freed, so we leave immediately. * * Checking rb::aux_mmap_count and rb::refcount has to be done in * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; nest = READ_ONCE(rb->aux_nest); /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ if (WARN_ON_ONCE(nest)) goto err_put; WRITE_ONCE(rb->aux_nest, nest + 1); aux_head = rb->aux_head; handle->rb = rb; handle->event = event; handle->head = aux_head; handle->size = 0; handle->aux_flags = 0; /* * In overwrite mode, AUX data stores do not depend on aux_tail, * therefore (A) control dependency barrier does not exist. The * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { aux_tail = READ_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); /* * handle->size computation depends on aux_tail load; this forms a * control dependency barrier separating aux_tail load from aux data * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; } } return handle->rb->aux_priv; err_put: /* can't be last */ rb_free_aux(rb); err: ring_buffer_put(rb); handle->event = NULL; return NULL; } EXPORT_SYMBOL_GPL(perf_aux_output_begin); static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb) { if (rb->aux_overwrite) return false; if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); return true; } return false; } /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. * * Note: this has to be called from pmu::stop() callback, as the assumption * of the AUX buffer management code is that after pmu::stop(), the AUX * transaction must be stopped and therefore drop the AUX reference count. */ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); struct perf_buffer *rb = handle->rb; unsigned long aux_head; /* in overwrite mode, driver provides aux_head via handle */ if (rb->aux_overwrite) { handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; rb->aux_head = aux_head; } else { handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; aux_head = rb->aux_head; rb->aux_head += size; } /* * Only send RECORD_AUX if we have something useful to communicate * * Note: the OVERWRITE records by themselves are not considered * useful, as they don't communicate any *new* information, * aside from the short-lived offset, that becomes history at * the next event sched-in and therefore isn't useful. * The userspace that needs to copy out AUX data in overwrite * mode should know to use user_page::aux_head for the actual * offset. So, from now on we don't output AUX records that * have *only* OVERWRITE flag set. */ if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) handle->event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); } handle->event = NULL; WRITE_ONCE(rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); } EXPORT_SYMBOL_GPL(perf_aux_output_end); /* * Skip over a given number of bytes in the AUX buffer, due to, for example, * hardware's alignment constraints. */ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { struct perf_buffer *rb = handle->rb; if (size > handle->size) return -ENOSPC; rb->aux_head += size; WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } handle->head = rb->aux_head; handle->size -= size; return 0; } EXPORT_SYMBOL_GPL(perf_aux_output_skip); void *perf_get_aux(struct perf_output_handle *handle) { /* this is only valid between perf_aux_output_begin and *_end */ if (!handle->event) return NULL; return handle->rb->aux_priv; } EXPORT_SYMBOL_GPL(perf_get_aux); /* * Copy out AUX data from an AUX handle. */ long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to) { struct perf_buffer *rb = aux_handle->rb; unsigned long tocopy, remainder, len = 0; void *addr; from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; do { tocopy = PAGE_SIZE - offset_in_page(from); if (to > from) tocopy = min(tocopy, to - from); if (!tocopy) break; addr = rb->aux_pages[from >> PAGE_SHIFT]; addr += offset_in_page(from); remainder = perf_output_copy(handle, addr, tocopy); if (remainder) return -EFAULT; len += tocopy; from += tocopy; from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; } while (to != from); return len; } #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; if (order > MAX_ORDER) order = MAX_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); } while (!page && order--); if (page && order) { /* * Communicate the allocation size to the driver: * if we managed to secure a high-order allocation, * set its first page's private to this order; * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); set_page_private(page, order); } return page; } static void rb_free_aux_page(struct perf_buffer *rb, int idx) { struct page *page = virt_to_page(rb->aux_pages[idx]); ClearPagePrivate(page); page->mapping = NULL; __free_page(page); } static void __rb_free_aux(struct perf_buffer *rb) { int pg; /* * Should never happen, the last reference should be dropped from * perf_mmap_close() path, which first stops aux transactions (which * in turn are the atomic holders of aux_refcount) and then does the * last rb_free_aux(). */ WARN_ON_ONCE(in_atomic()); if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; rb->aux_priv = NULL; } if (rb->aux_nr_pages) { for (pg = 0; pg < rb->aux_nr_pages; pg++) rb_free_aux_page(rb, pg); kfree(rb->aux_pages); rb->aux_nr_pages = 0; } } int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); int ret = -ENOMEM, max_order; if (!has_aux(event)) return -EOPNOTSUPP; if (!overwrite) { /* * Watermark defaults to half the buffer, and so does the * max_order, to aid PMU drivers in double buffering. */ if (!watermark) watermark = min_t(unsigned long, U32_MAX, (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* * Use aux_watermark as the basis for chunking to * help PMU drivers honor the watermark. */ max_order = get_order(watermark); } else { /* * We need to start with the max_order that fits in nr_pages, * not the other way around, hence ilog2() and not get_order. */ max_order = ilog2(nr_pages); watermark = 0; } /* * kcalloc_node() is unable to allocate buffer if the size is larger * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case. */ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER) return -ENOMEM; rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) return -ENOMEM; rb->free_aux = event->pmu->free_aux; for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { struct page *page; int last, order; order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); page = rb_alloc_aux_page(node, order); if (!page) goto out; for (last = rb->aux_nr_pages + (1 << page_private(page)); last > rb->aux_nr_pages; rb->aux_nr_pages++) rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } /* * In overwrite mode, PMUs that don't support SG may not handle more * than one contiguous allocation, since they rely on PMI to do double * buffering. In this case, the entire buffer has to be one contiguous * chunk. */ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && overwrite) { struct page *page = virt_to_page(rb->aux_pages[0]); if (page_private(page) != max_order) goto out; } rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; ret = 0; /* * aux_pages (and pmu driver's private data, aux_priv) will be * referenced in both producer's and consumer's contexts, thus * we keep a refcount here to make sure either of the two can * reference them safely. */ refcount_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; out: if (!ret) rb->aux_pgoff = pgoff; else __rb_free_aux(rb); return ret; } void rb_free_aux(struct perf_buffer *rb) { if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); } #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; if (pgoff == 0) return virt_to_page(rb->user_page); return virt_to_page(rb->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) { struct page *page; int node; node = (cpu == -1) ? cpu : cpu_to_node(cpu); page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) return NULL; return page_address(page); } static void perf_mmap_free_page(void *addr) { struct page *page = virt_to_page(addr); page->mapping = NULL; __free_page(page); } struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *rb; unsigned long size; int i, node; size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) goto fail; node = (cpu == -1) ? cpu : cpu_to_node(cpu); rb = kzalloc_node(size, GFP_KERNEL, node); if (!rb) goto fail; rb->user_page = perf_mmap_alloc_page(cpu); if (!rb->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { rb->data_pages[i] = perf_mmap_alloc_page(cpu); if (!rb->data_pages[i]) goto fail_data_pages; } rb->nr_pages = nr_pages; ring_buffer_init(rb, watermark, flags); return rb; fail_data_pages: for (i--; i >= 0; i--) perf_mmap_free_page(rb->data_pages[i]); perf_mmap_free_page(rb->user_page); fail_user_page: kfree(rb); fail: return NULL; } void rb_free(struct perf_buffer *rb) { int i; perf_mmap_free_page(rb->user_page); for (i = 0; i < rb->nr_pages; i++) perf_mmap_free_page(rb->data_pages[i]); kfree(rb); } #else static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) return NULL; return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) { struct page *page = vmalloc_to_page(addr); page->mapping = NULL; } static void rb_free_work(struct work_struct *work) { struct perf_buffer *rb; void *base; int i, nr; rb = container_of(work, struct perf_buffer, work); nr = data_page_nr(rb); base = rb->user_page; /* The '<=' counts in the user page. */ for (i = 0; i <= nr; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); kfree(rb); } void rb_free(struct perf_buffer *rb) { schedule_work(&rb->work); } struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *rb; unsigned long size; void *all_buf; int node; size = sizeof(struct perf_buffer); size += sizeof(void *); node = (cpu == -1) ? cpu : cpu_to_node(cpu); rb = kzalloc_node(size, GFP_KERNEL, node); if (!rb) goto fail; INIT_WORK(&rb->work, rb_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; if (nr_pages) { rb->nr_pages = 1; rb->page_order = ilog2(nr_pages); } ring_buffer_init(rb, watermark, flags); return rb; fail_all_buf: kfree(rb); fail: return NULL; } #endif struct page * perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { if (rb->aux_nr_pages) { /* above AUX space */ if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) return NULL; /* AUX space */ if (pgoff >= rb->aux_pgoff) { int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); return virt_to_page(rb->aux_pages[aux_pgoff]); } } return __perf_mmap_to_page(rb, pgoff); }
325 313 3 53 131 61 155 564 184 371 425 612 430 26 25 266 294 265 487 489 156 115 39 2 138 6 9 6 9 8 7 129 78 73 4 38 34 4 3 109 47 2 17 363 73 352 57 5 2 614 608 609 159 453 620 3 297 322 620 289 313 628 27 27 7 1 1 1 1 27 27 1 11 26 1 25 1 1 1 23 24 24 24 21 24 3 8 2 5 3 3 8 3 3 3 3 3 47 47 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Generic Netlink Family * * Authors: Jamal Hadi Salim * Thomas Graf <tgraf@suug.ch> * Johannes Berg <johannes@sipsolutions.net> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/mutex.h> #include <linux/bitmap.h> #include <linux/rwsem.h> #include <linux/idr.h> #include <net/sock.h> #include <net/genetlink.h> static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq); void genl_lock(void) { mutex_lock(&genl_mutex); } EXPORT_SYMBOL(genl_lock); void genl_unlock(void) { mutex_unlock(&genl_mutex); } EXPORT_SYMBOL(genl_unlock); static void genl_lock_all(void) { down_write(&cb_lock); genl_lock(); } static void genl_unlock_all(void) { genl_unlock(); up_write(&cb_lock); } static DEFINE_IDR(genl_fam_idr); /* * Bitmap of multicast groups that are currently in use. * * To avoid an allocation at boot of just one unsigned long, * declare it global instead. * Bit 0 is marked as already used since group 0 is invalid. * Bit 1 is marked as already used since the drop-monitor code * abuses the API and thinks it can statically use group 1. * That group will typically conflict with other groups that * any proper users use. * Bit 16 is marked as used since it's used for generic netlink * and the code no longer marks pre-reserved IDs as used. * Bit 17 is marked as already used since the VFS quota code * also abused this API and relied on family == group ID, we * cater to that by giving it a static family and group ID. * Bit 18 is marked as already used since the PMCRAID driver * did the same thing as the VFS quota code (maybe copied?) */ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | BIT(GENL_ID_VFS_DQUOT) | BIT(GENL_ID_PMCRAID); static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id); static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); } static const struct genl_family *genl_family_find_byname(char *name) { const struct genl_family *family; unsigned int id; idr_for_each_entry(&genl_fam_idr, family, id) if (strcmp(family->name, name) == 0) return family; return NULL; } static int genl_get_cmd_cnt(const struct genl_family *family) { return family->n_ops + family->n_small_ops; } static void genl_op_from_full(const struct genl_family *family, unsigned int i, struct genl_ops *op) { *op = family->ops[i]; if (!op->maxattr) op->maxattr = family->maxattr; if (!op->policy) op->policy = family->policy; } static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_ops; i++) if (family->ops[i].cmd == cmd) { genl_op_from_full(family, i, op); return 0; } return -ENOENT; } static void genl_op_from_small(const struct genl_family *family, unsigned int i, struct genl_ops *op) { memset(op, 0, sizeof(*op)); op->doit = family->small_ops[i].doit; op->dumpit = family->small_ops[i].dumpit; op->cmd = family->small_ops[i].cmd; op->internal_flags = family->small_ops[i].internal_flags; op->flags = family->small_ops[i].flags; op->validate = family->small_ops[i].validate; op->maxattr = family->maxattr; op->policy = family->policy; } static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_small_ops; i++) if (family->small_ops[i].cmd == cmd) { genl_op_from_small(family, i, op); return 0; } return -ENOENT; } static int genl_get_cmd(u32 cmd, const struct genl_family *family, struct genl_ops *op) { if (!genl_get_cmd_full(cmd, family, op)) return 0; return genl_get_cmd_small(cmd, family, op); } static void genl_get_cmd_by_index(unsigned int i, const struct genl_family *family, struct genl_ops *op) { if (i < family->n_ops) genl_op_from_full(family, i, op); else if (i < family->n_ops + family->n_small_ops) genl_op_from_small(family, i - family->n_ops, op); else WARN_ON_ONCE(1); } static int genl_allocate_reserve_groups(int n_groups, int *first_id) { unsigned long *new_groups; int start = 0; int i; int id; bool fits; do { if (start == 0) id = find_first_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG); else id = find_next_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG, start); fits = true; for (i = id; i < min_t(int, id + n_groups, mc_groups_longs * BITS_PER_LONG); i++) { if (test_bit(i, mc_groups)) { start = i; fits = false; break; } } if (id + n_groups > mc_groups_longs * BITS_PER_LONG) { unsigned long new_longs = mc_groups_longs + BITS_TO_LONGS(n_groups); size_t nlen = new_longs * sizeof(unsigned long); if (mc_groups == &mc_group_start) { new_groups = kzalloc(nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; *mc_groups = mc_group_start; } else { new_groups = krealloc(mc_groups, nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; for (i = 0; i < BITS_TO_LONGS(n_groups); i++) mc_groups[mc_groups_longs + i] = 0; } mc_groups_longs = new_longs; } } while (!fits); for (i = id; i < id + n_groups; i++) set_bit(i, mc_groups); *first_id = id; return 0; } static struct genl_family genl_ctrl; static int genl_validate_assign_mc_groups(struct genl_family *family) { int first_id; int n_groups = family->n_mcgrps; int err = 0, i; bool groups_allocated = false; if (!n_groups) return 0; for (i = 0; i < n_groups; i++) { const struct genl_multicast_group *grp = &family->mcgrps[i]; if (WARN_ON(grp->name[0] == '\0')) return -EINVAL; if (WARN_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL)) return -EINVAL; } /* special-case our own group and hacks */ if (family == &genl_ctrl) { first_id = GENL_ID_CTRL; BUG_ON(n_groups != 1); } else if (strcmp(family->name, "NET_DM") == 0) { first_id = 1; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_VFS_DQUOT) { first_id = GENL_ID_VFS_DQUOT; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_PMCRAID) { first_id = GENL_ID_PMCRAID; BUG_ON(n_groups != 1); } else { groups_allocated = true; err = genl_allocate_reserve_groups(n_groups, &first_id); if (err) return err; } family->mcgrp_offset = first_id; /* if still initializing, can't and don't need to realloc bitmaps */ if (!init_net.genl_sock) return 0; if (family->netnsok) { struct net *net; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { err = __netlink_change_ngroups(net->genl_sock, mc_groups_longs * BITS_PER_LONG); if (err) { /* * No need to roll back, can only fail if * memory allocation fails and then the * number of _possible_ groups has been * increased on some sockets which is ok. */ break; } } rcu_read_unlock(); netlink_table_ungrab(); } else { err = netlink_change_ngroups(init_net.genl_sock, mc_groups_longs * BITS_PER_LONG); } if (groups_allocated && err) { for (i = 0; i < family->n_mcgrps; i++) clear_bit(family->mcgrp_offset + i, mc_groups); } return err; } static void genl_unregister_mc_groups(const struct genl_family *family) { struct net *net; int i; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { for (i = 0; i < family->n_mcgrps; i++) __netlink_clear_multicast_users( net->genl_sock, family->mcgrp_offset + i); } rcu_read_unlock(); netlink_table_ungrab(); for (i = 0; i < family->n_mcgrps; i++) { int grp_id = family->mcgrp_offset + i; if (grp_id != 1) clear_bit(grp_id, mc_groups); genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family, &family->mcgrps[i], grp_id); } } static int genl_validate_ops(const struct genl_family *family) { int i, j; if (WARN_ON(family->n_ops && !family->ops) || WARN_ON(family->n_small_ops && !family->small_ops)) return -EINVAL; for (i = 0; i < genl_get_cmd_cnt(family); i++) { struct genl_ops op; genl_get_cmd_by_index(i, family, &op); if (op.dumpit == NULL && op.doit == NULL) return -EINVAL; for (j = i + 1; j < genl_get_cmd_cnt(family); j++) { struct genl_ops op2; genl_get_cmd_by_index(j, family, &op2); if (op.cmd == op2.cmd) return -EINVAL; } } return 0; } /** * genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. * * The family's ops, multicast groups and module pointer must already * be assigned. * * Return 0 on success or a negative error code. */ int genl_register_family(struct genl_family *family) { int err, i; int start = GENL_START_ALLOC, end = GENL_MAX_ID; err = genl_validate_ops(family); if (err) return err; genl_lock_all(); if (genl_family_find_byname(family->name)) { err = -EEXIST; goto errout_locked; } /* * Sadly, a few cases need to be special-cased * due to them having previously abused the API * and having used their family ID also as their * multicast group ID, so we use reserved IDs * for both to be sure we can do that mapping. */ if (family == &genl_ctrl) { /* and this needs to be special for initial family lookups */ start = end = GENL_ID_CTRL; } else if (strcmp(family->name, "pmcraid") == 0) { start = end = GENL_ID_PMCRAID; } else if (strcmp(family->name, "VFS_DQUOT") == 0) { start = end = GENL_ID_VFS_DQUOT; } family->id = idr_alloc_cyclic(&genl_fam_idr, family, start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; goto errout_locked; } err = genl_validate_assign_mc_groups(family); if (err) goto errout_remove; genl_unlock_all(); /* send all events */ genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0); for (i = 0; i < family->n_mcgrps; i++) genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family, &family->mcgrps[i], family->mcgrp_offset + i); return 0; errout_remove: idr_remove(&genl_fam_idr, family->id); errout_locked: genl_unlock_all(); return err; } EXPORT_SYMBOL(genl_register_family); /** * genl_unregister_family - unregister generic netlink family * @family: generic netlink family * * Unregisters the specified family. * * Returns 0 on success or a negative error code. */ int genl_unregister_family(const struct genl_family *family) { genl_lock_all(); if (!genl_family_find_byid(family->id)) { genl_unlock_all(); return -ENOENT; } genl_unregister_mc_groups(family); idr_remove(&genl_fam_idr, family->id); up_write(&cb_lock); wait_event(genl_sk_destructing_waitq, atomic_read(&genl_sk_destructing_cnt) == 0); genl_unlock(); genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); return 0; } EXPORT_SYMBOL(genl_unregister_family); /** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to * @seq: sequence number (usually the one of the sender) * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header */ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd) { struct nlmsghdr *nlh; struct genlmsghdr *hdr; nlh = nlmsg_put(skb, portid, seq, family->id, GENL_HDRLEN + family->hdrsize, flags); if (nlh == NULL) return NULL; hdr = nlmsg_data(nlh); hdr->cmd = cmd; hdr->version = family->version; hdr->reserved = 0; return (char *) hdr + GENL_HDRLEN; } EXPORT_SYMBOL(genlmsg_put); static struct genl_dumpit_info *genl_dumpit_info_alloc(void) { return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); } static void genl_dumpit_info_free(const struct genl_dumpit_info *info) { kfree(info); } static struct nlattr ** genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_ops *ops, int hdrlen, enum genl_validate_flags no_strict_flag) { enum netlink_validation validate = ops->validate & no_strict_flag ? NL_VALIDATE_LIBERAL : NL_VALIDATE_STRICT; struct nlattr **attrbuf; int err; if (!ops->maxattr) return NULL; attrbuf = kmalloc_array(ops->maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) return ERR_PTR(-ENOMEM); err = __nlmsg_parse(nlh, hdrlen, attrbuf, ops->maxattr, ops->policy, validate, extack); if (err) { kfree(attrbuf); return ERR_PTR(err); } return attrbuf; } static void genl_family_rcv_msg_attrs_free(struct nlattr **attrbuf) { kfree(attrbuf); } struct genl_start_context { const struct genl_family *family; struct nlmsghdr *nlh; struct netlink_ext_ack *extack; const struct genl_ops *ops; int hdrlen; }; static int genl_start(struct netlink_callback *cb) { struct genl_start_context *ctx = cb->data; const struct genl_ops *ops = ctx->ops; struct genl_dumpit_info *info; struct nlattr **attrs = NULL; int rc = 0; if (ops->validate & GENL_DONT_VALIDATE_DUMP) goto no_attrs; if (ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) return -EINVAL; attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, ops, ctx->hdrlen, GENL_DONT_VALIDATE_DUMP_STRICT); if (IS_ERR(attrs)) return PTR_ERR(attrs); no_attrs: info = genl_dumpit_info_alloc(); if (!info) { genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } info->family = ctx->family; info->op = *ops; info->attrs = attrs; cb->data = info; if (ops->start) { if (!ctx->family->parallel_ops) genl_lock(); rc = ops->start(cb); if (!ctx->family->parallel_ops) genl_unlock(); } if (rc) { genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); cb->data = NULL; } return rc; } static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_ops *ops = &genl_dumpit_info(cb)->op; int rc; genl_lock(); rc = ops->dumpit(skb, cb); genl_unlock(); return rc; } static int genl_lock_done(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); const struct genl_ops *ops = &info->op; int rc = 0; if (ops->done) { genl_lock(); rc = ops->done(cb); genl_unlock(); } genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); return rc; } static int genl_parallel_done(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); const struct genl_ops *ops = &info->op; int rc = 0; if (ops->done) rc = ops->done(cb); genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); return rc; } static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_ops *ops, int hdrlen, struct net *net) { struct genl_start_context ctx; int err; if (!ops->dumpit) return -EOPNOTSUPP; ctx.family = family; ctx.nlh = nlh; ctx.extack = extack; ctx.ops = ops; ctx.hdrlen = hdrlen; if (!family->parallel_ops) { struct netlink_dump_control c = { .module = family->module, .data = &ctx, .start = genl_start, .dump = genl_lock_dumpit, .done = genl_lock_done, }; genl_unlock(); err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_lock(); } else { struct netlink_dump_control c = { .module = family->module, .data = &ctx, .start = genl_start, .dump = ops->dumpit, .done = genl_parallel_done, }; err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); } return err; } static int genl_family_rcv_msg_doit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_ops *ops, int hdrlen, struct net *net) { struct nlattr **attrbuf; struct genl_info info; int err; if (!ops->doit) return -EOPNOTSUPP; attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen, GENL_DONT_VALIDATE_STRICT); if (IS_ERR(attrbuf)) return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); if (family->pre_doit) { err = family->pre_doit(ops, skb, &info); if (err) goto out; } err = ops->doit(skb, &info); if (family->post_doit) family->post_doit(ops, skb, &info); out: genl_family_rcv_msg_attrs_free(attrbuf); return err; } static int genl_family_rcv_msg(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct genlmsghdr *hdr = nlmsg_data(nlh); struct genl_ops op; int hdrlen; /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) return -ENOENT; hdrlen = GENL_HDRLEN + family->hdrsize; if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; if (genl_get_cmd(hdr->cmd, family, &op)) return -EOPNOTSUPP; if ((op.flags & GENL_ADMIN_PERM) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((op.flags & GENL_UNS_ADMIN_PERM) && !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, &op, hdrlen, net); else return genl_family_rcv_msg_doit(family, skb, nlh, extack, &op, hdrlen, net); } static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { const struct genl_family *family; int err; family = genl_family_find_byid(nlh->nlmsg_type); if (family == NULL) return -ENOENT; if (!family->parallel_ops) genl_lock(); err = genl_family_rcv_msg(family, skb, nlh, extack); if (!family->parallel_ops) genl_unlock(); return err; } static void genl_rcv(struct sk_buff *skb) { down_read(&cb_lock); netlink_rcv_skb(skb, &genl_rcv_msg); up_read(&cb_lock); } /************************************************************************** * Controller **************************************************************************/ static struct genl_family genl_ctrl; static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id) || nla_put_u32(skb, CTRL_ATTR_VERSION, family->version) || nla_put_u32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize) || nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr)) goto nla_put_failure; if (genl_get_cmd_cnt(family)) { struct nlattr *nla_ops; int i; nla_ops = nla_nest_start_noflag(skb, CTRL_ATTR_OPS); if (nla_ops == NULL) goto nla_put_failure; for (i = 0; i < genl_get_cmd_cnt(family); i++) { struct nlattr *nest; struct genl_ops op; u32 op_flags; genl_get_cmd_by_index(i, family, &op); op_flags = op.flags; if (op.dumpit) op_flags |= GENL_CMD_CAP_DUMP; if (op.doit) op_flags |= GENL_CMD_CAP_DO; if (op.policy) op_flags |= GENL_CMD_CAP_HASPOL; nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_OP_ID, op.cmd) || nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_ops); } if (family->n_mcgrps) { struct nlattr *nla_grps; int i; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; for (i = 0; i < family->n_mcgrps; i++) { struct nlattr *nest; const struct genl_multicast_group *grp; grp = &family->mcgrps[i]; nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, family->mcgrp_offset + i) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_grps); } genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_fill_mcgrp_info(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; struct nlattr *nla_grps; struct nlattr *nest; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id)) goto nla_put_failure; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; nest = nla_nest_start_noflag(skb, 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); nla_nest_end(skb, nla_grps); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) { int n = 0; struct genl_family *rt; struct net *net = sock_net(skb->sk); int fams_to_skip = cb->args[0]; unsigned int id; idr_for_each_entry(&genl_fam_idr, rt, id) { if (!rt->netnsok && !net_eq(net, &init_net)) continue; if (n++ < fams_to_skip) continue; if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, CTRL_CMD_NEWFAMILY) < 0) { n--; break; } } cb->args[0] = n; return skb->len; } static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_info(family, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static struct sk_buff * ctrl_build_mcgrp_msg(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_mcgrp_info(family, grp, grp_id, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static const struct nla_policy ctrl_policy_family[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, }; static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; const struct genl_family *res = NULL; int err = -EINVAL; if (info->attrs[CTRL_ATTR_FAMILY_ID]) { u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); res = genl_family_find_byid(id); err = -ENOENT; } if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { char *name; name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); res = genl_family_find_byname(name); #ifdef CONFIG_MODULES if (res == NULL) { genl_unlock(); up_read(&cb_lock); request_module("net-pf-%d-proto-%d-family-%s", PF_NETLINK, NETLINK_GENERIC, name); down_read(&cb_lock); genl_lock(); res = genl_family_find_byname(name); } #endif err = -ENOENT; } if (res == NULL) return err; if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) { /* family doesn't exist here */ return -ENOENT; } msg = ctrl_build_family_msg(res, info->snd_portid, info->snd_seq, CTRL_CMD_NEWFAMILY); if (IS_ERR(msg)) return PTR_ERR(msg); return genlmsg_reply(msg, info); } static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id) { struct sk_buff *msg; /* genl is still initialising */ if (!init_net.genl_sock) return 0; switch (event) { case CTRL_CMD_NEWFAMILY: case CTRL_CMD_DELFAMILY: WARN_ON(grp); msg = ctrl_build_family_msg(family, 0, 0, event); break; case CTRL_CMD_NEWMCAST_GRP: case CTRL_CMD_DELMCAST_GRP: BUG_ON(!grp); msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event); break; default: return -EINVAL; } if (IS_ERR(msg)) return PTR_ERR(msg); if (!family->netnsok) genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0, 0, GFP_KERNEL); else genlmsg_multicast_allns(&genl_ctrl, msg, 0, 0); return 0; } struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; const struct genl_family *rt; unsigned int opidx; u32 op; u16 fam_id; u8 policies:1, single_op:1; }; static const struct nla_policy ctrl_policy_policy[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, [CTRL_ATTR_OP] = { .type = NLA_U32 }, }; static int ctrl_dumppolicy_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->attrs; const struct genl_family *rt; struct genl_ops op; int err, i; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) return -EINVAL; if (tb[CTRL_ATTR_FAMILY_ID]) { ctx->fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); } else { rt = genl_family_find_byname( nla_data(tb[CTRL_ATTR_FAMILY_NAME])); if (!rt) return -ENOENT; ctx->fam_id = rt->id; } rt = genl_family_find_byid(ctx->fam_id); if (!rt) return -ENOENT; ctx->rt = rt; if (tb[CTRL_ATTR_OP]) { ctx->single_op = true; ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); err = genl_get_cmd(ctx->op, rt, &op); if (err) { NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); return err; } if (!op.policy) return -ENODATA; return netlink_policy_dump_add_policy(&ctx->state, op.policy, op.maxattr); } for (i = 0; i < genl_get_cmd_cnt(rt); i++) { genl_get_cmd_by_index(i, rt, &op); if (op.policy) { err = netlink_policy_dump_add_policy(&ctx->state, op.policy, op.maxattr); if (err) goto err_free_state; } } if (!ctx->state) return -ENODATA; return 0; err_free_state: netlink_policy_dump_free(ctx->state); return err; } static void *ctrl_dumppolicy_prep(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &genl_ctrl, NLM_F_MULTI, CTRL_CMD_GETPOLICY); if (!hdr) return NULL; if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, ctx->fam_id)) return NULL; return hdr; } static int ctrl_dumppolicy_put_op(struct sk_buff *skb, struct netlink_callback *cb, struct genl_ops *op) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr *nest_pol, *nest_op; void *hdr; int idx; /* skip if we have nothing to show */ if (!op->policy) return 0; if (!op->doit && (!op->dumpit || op->validate & GENL_DONT_VALIDATE_DUMP)) return 0; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) return -ENOBUFS; nest_pol = nla_nest_start(skb, CTRL_ATTR_OP_POLICY); if (!nest_pol) goto err; nest_op = nla_nest_start(skb, op->cmd); if (!nest_op) goto err; /* for now both do/dump are always the same */ idx = netlink_policy_dump_get_policy_idx(ctx->state, op->policy, op->maxattr); if (op->doit && nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) goto err; if (op->dumpit && !(op->validate & GENL_DONT_VALIDATE_DUMP) && nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) goto err; nla_nest_end(skb, nest_op); nla_nest_end(skb, nest_pol); genlmsg_end(skb, hdr); return 0; err: genlmsg_cancel(skb, hdr); return -ENOBUFS; } static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; if (!ctx->policies) { while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { struct genl_ops op; if (ctx->single_op) { int err; err = genl_get_cmd(ctx->op, ctx->rt, &op); if (WARN_ON(err)) return skb->len; /* break out of the loop after this one */ ctx->opidx = genl_get_cmd_cnt(ctx->rt); } else { genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); } if (ctrl_dumppolicy_put_op(skb, cb, &op)) return skb->len; ctx->opidx++; } /* completed with the per-op policy index list */ ctx->policies = true; } while (netlink_policy_dump_loop(ctx->state)) { struct nlattr *nest; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) goto nla_put_failure; nest = nla_nest_start(skb, CTRL_ATTR_POLICY); if (!nest) goto nla_put_failure; if (netlink_policy_dump_write(skb, ctx->state)) goto nla_put_failure; nla_nest_end(skb, nest); genlmsg_end(skb, hdr); } return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); return skb->len; } static int ctrl_dumppolicy_done(struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; netlink_policy_dump_free(ctx->state); return 0; } static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .doit = ctrl_getfamily, .dumpit = ctrl_dumpfamily, }, { .cmd = CTRL_CMD_GETPOLICY, .policy = ctrl_policy_policy, .maxattr = ARRAY_SIZE(ctrl_policy_policy) - 1, .start = ctrl_dumppolicy_start, .dumpit = ctrl_dumppolicy, .done = ctrl_dumppolicy_done, }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; static struct genl_family genl_ctrl __ro_after_init = { .module = THIS_MODULE, .ops = genl_ctrl_ops, .n_ops = ARRAY_SIZE(genl_ctrl_ops), .mcgrps = genl_ctrl_groups, .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups), .id = GENL_ID_CTRL, .name = "nlctrl", .version = 0x2, .netnsok = true, }; static int genl_bind(struct net *net, int group) { const struct genl_family *family; unsigned int id; int ret = 0; genl_lock_all(); idr_for_each_entry(&genl_fam_idr, family, id) { const struct genl_multicast_group *grp; int i; if (family->n_mcgrps == 0) continue; i = group - family->mcgrp_offset; if (i < 0 || i >= family->n_mcgrps) continue; grp = &family->mcgrps[i]; if ((grp->flags & GENL_UNS_ADMIN_PERM) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) ret = -EPERM; if (grp->cap_sys_admin && !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; break; } genl_unlock_all(); return ret; } static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = genl_bind, }; /* we'll bump the group number right afterwards */ net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); if (!net->genl_sock) return -ENOMEM; return 0; } static void __net_exit genl_pernet_exit(struct net *net) { netlink_kernel_release(net->genl_sock); net->genl_sock = NULL; } static struct pernet_operations genl_pernet_ops = { .init = genl_pernet_init, .exit = genl_pernet_exit, }; static int __init genl_init(void) { int err; err = genl_register_family(&genl_ctrl); if (err < 0) goto problem; err = register_pernet_subsys(&genl_pernet_ops); if (err) goto problem; return 0; problem: panic("GENL: Cannot register controller: %d\n", err); } core_initcall(genl_init); static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group) { struct sk_buff *tmp; struct net *net, *prev = NULL; bool delivered = false; int err; rcu_read_lock(); for_each_net_rcu(net) { if (prev) { tmp = skb_clone(skb, GFP_ATOMIC); if (!tmp) { err = -ENOMEM; goto error; } err = nlmsg_multicast(prev->genl_sock, tmp, portid, group, GFP_ATOMIC); if (!err) delivered = true; else if (err != -ESRCH) goto error; } prev = net; } err = nlmsg_multicast(prev->genl_sock, skb, portid, group, GFP_ATOMIC); rcu_read_unlock(); if (!err) delivered = true; else if (err != -ESRCH) return err; return delivered ? 0 : -ESRCH; error: rcu_read_unlock(); kfree_skb(skb); return err; } int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group); } EXPORT_SYMBOL(genlmsg_multicast_allns); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags) { struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, nlmsg_report(info->nlhdr), flags); } EXPORT_SYMBOL(genl_notify);
233 1 233 234 234 5 233 816 818 234 190 190 148 147 148 148 148 148 148 1 148 148 14 40 223 148 148 227 227 227 227 1 227 4 2 1 233 234 233 233 189 189 8 47 148 234 189 234 234 4 230 234 234 234 234 226 338 341 189 265 266 149 149 814 817 814 816 817 339 190 185 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] static inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * We do not migrate the timer when it is expiring before the next * event on the target cpu. When high resolution is enabled, we cannot * reprogram the target cpu hardware and we would cause it to fire * late. To keep it simple, we handle the high resolution enabled and * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires < new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static inline int hrtimer_hres_active(void) { return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } static void retrigger_next_event(void *arg); /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ if (!__hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); if (__hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); raw_spin_unlock(&base->lock); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. */ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* * If the hrtimer interrupt is running, then it will reevaluate the * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* * If the remote CPU is currently handling an hrtimer interrupt, it * will reevaluate the first expiring timer of all clock bases * before reprogramming. Nothing to do here. */ if (cpu_base->in_hrtirq) return false; /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; next = timerqueue_getnext(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(unsigned int bases) { struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } preempt_disable(); smp_call_function_many(mask, retrigger_next_event, NULL, 1); preempt_enable(); cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. * * Can be safely called from the callback function of @timer. If * called from other contexts @timer must neither be enqueued nor * running the callback and the caller needs to take care of * serialization. * * Note: This only updates the timer expiry value and does not requeue * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffie) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; if (WARN_ON_ONCE(!timer->function)) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (__hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } /* called with interrupts disabled */ static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; if (!hrtimer_hres_active()) return; td = this_cpu_ptr(&tick_cpu_device); if (td && td->evtdev) hrtimer_interrupt(td->evtdev); } #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (__hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_init_sleeper - initialize sleeper to the given clock * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) freezable_schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = &current->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = &current->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } int hrtimers_cpu_starting(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { struct hrtimer_cpu_base *old_base, *new_base; int i, ncpu = cpumask_first(cpu_active_mask); tick_cancel_sched_timer(dying_cpu); old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * @clock_id: timer clock to be used */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } /* * Override any slack passed by the user if under * rt contraints. */ if (rt_task(current)) delta = 0; hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout);
1781 1774 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * No idle tick implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* * The time, when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64bit can do a quick check without holding jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32bit cannot do that because the store of tick_next_period * consists of two 32bit stores and the first store could move it * to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on jiffies_lock and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Reevaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the jiffies_seq protected job */ jiffies_64 += ticks; /* * Keep the tick_next_period variable up to date. */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above and ensures that the update to jiffies_64 is * not reordered vs. the store to tick_next_period, neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32bit as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but jiffies_lock needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. * * If nohz_full is enabled, this should not happen because the * tick_do_timer_cpu never relinquishes. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif tick_do_timer_cpu = cpu; } #endif /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); /* * If jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (ts->inidle) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { #ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on complete idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } #endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(&current->tick_dep_mask)) return false; if (check_tick_dependency(&current->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, IPI can then be spared. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * by unstable clock. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage events throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote irq work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU need this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { if (atomic_read(&current->tick_dep_mask) || atomic_read(&current->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The tick_do_timer_cpu CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return ts->tick_stopped; } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } /* * Updates the per-CPU time idle statistics counters */ static void update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; } if (last_update_time) *last_update_time = ktime_to_us(now); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) { update_ts_time_stats(cpu, ts, now, last_update_time); idle = ts->idle_sleeptime; } else { if (ts->idle_active && !nr_iowait_cpu(cpu)) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(ts->idle_sleeptime, delta); } else { idle = ts->idle_sleeptime; } } return ktime_to_us(idle); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) { update_ts_time_stats(cpu, ts, now, last_update_time); iowait = ts->iowait_sleeptime; } else { if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); iowait = ktime_add(ts->iowait_sleeptime, delta); } else { iowait = ts->iowait_sleeptime; } } return ktime_to_us(iowait); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long basejiff; unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that check whether the local timer softirq is * pending. If so its a bad idea to call get_next_timer_interrupt() * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tmr = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tmr; /* Take the next rcu event into account */ next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): */ timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); if (cpu != tick_do_timer_cpu && (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; ktime_t tick = expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { ts->do_timer_last = 0; } /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when * interrupts arrive which do not cause a reschedule. In the * first call we save the current tick time, so we can restart * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = tick; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ ts->tick_stopped = 0; tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; __tick_nohz_full_update_tick(ts, ktime_get()); } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; /* * Make sure the CPU doesn't get fooled by obsolete tick * deadline if it comes back online later. */ ts->next_tick = 0; return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; if (need_resched()) return false; if (unlikely(local_softirq_pending())) { static int ratelimit; if (ratelimit < 10 && !local_bh_blocked() && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } return false; } if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_do_timer_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { ktime_t expires; int cpu = smp_processor_id(); /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = ts->tick_stopped; tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); /* * Undo the effect of get_next_timer_interrupt() called from * tick_nohz_next_event(). */ timer_clear_idle(); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 1; tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - update next tick event from interrupt exit * * When an interrupt fires while we are idle and it doesn't cause * a reschedule, it may still add, modify or delete a timer, enqueue * an RCU callback, etc... * So we need to re-calculate and reprogram the next tick event. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whatever that expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!ts->inidle); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than next_event, the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } /** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick * accounting. Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle * This also exit the RCU extended quiescent state. The CPU * can use RCU again after this function is called. */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; idle_active = ts->idle_active; tick_stopped = ts->tick_stopped; if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * The nohz low res interrupt handler */ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); dev->next_event = KTIME_MAX; tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); if (unlikely(ts->tick_stopped)) { /* * The clockevent device is not reprogrammed, so change the * clock event device to ONESHOT_STOPPED to avoid spurious * interrupts on devices which might not be truly one shot. */ tick_program_event(KTIME_MAX, 1); return; } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { if (!tick_nohz_enabled) return; ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to nohz mode */ static void tick_nohz_switch_to_nohz(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_handler)) return; /* * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle. We may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } /* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have * no valid regs pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer */ void tick_setup_sched_timer(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ts->sched_timer.function = tick_sched_timer; /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } #endif /** * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /** * Check, if a change happened, which makes oneshot possible. * * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (ts->nohz_mode != NOHZ_MODE_INACTIVE) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; }
13 11 5 13 4 8 1 1 8 8 4 1 3 4 4 2 2 2 2 2 4 4 4 4 2 2 3 1 4 12 12 10 3 12 1 12 6 5 1 1 1 6 9 2 1 6 8 4 7 1 7 3 13 10 13 13 13 4 4 4 4 1 3 24 24 24 24 1 4 4 22 3 3 3 3 5 5 5 5 43 19 19 19 13 5 904 899 13 11 7 2 2 1 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> * * The code this is based on carried the following copyright notice: * --- * (C) Copyright 2001-2006 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com * Re-worked by Ben Greear <greearb@candelatech.com> * --- */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/net_tstamp.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/if_link.h> #include <linux/if_macvlan.h> #include <linux/hash.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <linux/netpoll.h> #include <linux/phy.h> #define MACVLAN_HASH_BITS 8 #define MACVLAN_HASH_SIZE (1<<MACVLAN_HASH_BITS) #define MACVLAN_DEFAULT_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 #define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; struct list_head vlans; struct sk_buff_head bc_queue; struct work_struct bc_work; u32 bc_queue_len_used; u32 flags; int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { struct hlist_node hlist; struct macvlan_dev *vlan; unsigned char addr[6+2] __aligned(sizeof(u16)); struct rcu_head rcu; }; struct macvlan_skb_cb { const struct macvlan_dev *src; }; #define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0])) static void macvlan_port_destroy(struct net_device *dev); static void update_port_bc_queue_len(struct macvlan_port *port); static inline bool macvlan_passthru(const struct macvlan_port *port) { return port->flags & MACVLAN_F_PASSTHRU; } static inline void macvlan_set_passthru(struct macvlan_port *port) { port->flags |= MACVLAN_F_PASSTHRU; } static inline bool macvlan_addr_change(const struct macvlan_port *port) { return port->flags & MACVLAN_F_ADDRCHANGE; } static inline void macvlan_set_addr_change(struct macvlan_port *port) { port->flags |= MACVLAN_F_ADDRCHANGE; } static inline void macvlan_clear_addr_change(struct macvlan_port *port) { port->flags &= ~MACVLAN_F_ADDRCHANGE; } /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, MACVLAN_HASH_BITS); } static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } static struct macvlan_source_entry *macvlan_hash_lookup_source( const struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(entry->addr, addr) && entry->vlan == vlan) return entry; } return NULL; } static int macvlan_hash_add_source(struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_port *port = vlan->port; struct macvlan_source_entry *entry; struct hlist_head *h; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) return 0; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; ether_addr_copy(entry->addr, addr); entry->vlan = vlan; h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; hlist_add_head_rcu(&entry->hlist, h); vlan->macaddr_count++; return 0; } static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; u32 idx = macvlan_eth_hash(addr); hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); } static void macvlan_hash_del_source(struct macvlan_source_entry *entry) { hlist_del_rcu(&entry->hlist); kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); if (sync) synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ memcpy(vlan->dev->dev_addr, addr, ETH_ALEN); macvlan_hash_add(vlan); } static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. */ if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) return true; return false; } static int macvlan_broadcast_one(struct sk_buff *skb, const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { struct net_device *dev = vlan->dev; if (local) return __dev_forward_skb(dev, skb); skb->dev = dev; if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; return 0; } static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) { return (u32)(((unsigned long)vlan) >> L1_CACHE_SHIFT); } static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { u32 val = __get_unaligned_cpu32(addr + 2); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); } static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port, struct net_device *src, enum macvlan_mode mode) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; struct sk_buff *nskb; unsigned int i; int err; unsigned int hash; if (skb->protocol == htons(ETH_P_PAUSE)) return; hash_for_each_rcu(port->vlan_hash, i, vlan, hlist) { if (vlan->dev == src || !(vlan->mode & mode)) continue; hash = mc_hash(vlan, eth->h_dest); if (!test_bit(hash, vlan->mc_filter)) continue; err = NET_RX_DROP; nskb = skb_clone(skb, GFP_ATOMIC); if (likely(nskb)) err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE) ?: netif_rx_ni(nskb); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, true); } } static void macvlan_process_broadcast(struct work_struct *w) { struct macvlan_port *port = container_of(w, struct macvlan_port, bc_work); struct sk_buff *skb; struct sk_buff_head list; __skb_queue_head_init(&list); spin_lock_bh(&port->bc_queue.lock); skb_queue_splice_tail_init(&port->bc_queue, &list); spin_unlock_bh(&port->bc_queue.lock); while ((skb = __skb_dequeue(&list))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; rcu_read_lock(); if (!src) /* frame comes from an external address */ macvlan_broadcast(skb, port, NULL, MACVLAN_MODE_PRIVATE | MACVLAN_MODE_VEPA | MACVLAN_MODE_PASSTHRU| MACVLAN_MODE_BRIDGE); else if (src->mode == MACVLAN_MODE_VEPA) /* flood to everyone except source */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA | MACVLAN_MODE_BRIDGE); else /* * flood only to VEPA ports, bridge ports * already saw the frame on the way out. */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA); rcu_read_unlock(); if (src) dev_put(src->dev); consume_skb(skb); cond_resched(); } } static void macvlan_broadcast_enqueue(struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { struct sk_buff *nskb; int err = -ENOMEM; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) goto err; MACVLAN_SKB_CB(nskb)->src = src; spin_lock(&port->bc_queue.lock); if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) { if (src) dev_hold(src->dev); __skb_queue_tail(&port->bc_queue, nskb); err = 0; } spin_unlock(&port->bc_queue.lock); schedule_work(&port->bc_work); if (err) goto free_nskb; return; free_nskb: kfree_skb(nskb); err: atomic_long_inc(&skb->dev->rx_dropped); } static void macvlan_flush_sources(struct macvlan_port *port, struct macvlan_dev *vlan) { struct macvlan_source_entry *entry; struct hlist_node *next; int i; hash_for_each_safe(port->vlan_source_hash, i, next, entry, hlist) if (entry->vlan == vlan) macvlan_hash_del_source(entry); vlan->macaddr_count = 0; } static void macvlan_forward_source_one(struct sk_buff *skb, struct macvlan_dev *vlan) { struct sk_buff *nskb; struct net_device *dev; int len; int ret; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; len = nskb->len + ETH_HLEN; nskb->dev = dev; if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr)) nskb->pkt_type = PACKET_HOST; ret = netif_rx(nskb); macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { if (ether_addr_equal_64bits(entry->addr, addr)) { if (entry->vlan->flags & MACVLAN_FLAG_NODST) consume = true; macvlan_forward_source_one(skb, entry->vlan); } } return consume; } /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { struct macvlan_port *port; struct sk_buff *skb = *pskb; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; struct net_device *dev; unsigned int len = 0; int ret; rx_handler_result_t handle_res; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; port = macvlan_port_get_rcu(skb->dev); if (is_multicast_ether_addr(eth->h_dest)) { unsigned int hash; skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { /* forward to original port. */ vlan = src; ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?: netif_rx(skb); handle_res = RX_HANDLER_CONSUMED; goto out; } hash = mc_hash(NULL, eth->h_dest); if (test_bit(hash, port->mc_filter)) macvlan_broadcast_enqueue(port, src, skb); return RX_HANDLER_PASS; } if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); else vlan = macvlan_hash_lookup(port, eth->h_dest); if (!vlan || vlan->mode == MACVLAN_MODE_SOURCE) return RX_HANDLER_PASS; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { ret = NET_RX_DROP; handle_res = RX_HANDLER_CONSUMED; goto out; } *pskb = skb; skb->dev = dev; skb->pkt_type = PACKET_HOST; ret = NET_RX_SUCCESS; handle_res = RX_HANDLER_ANOTHER; out: macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); return handle_res; } static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = skb_eth_hdr(skb); /* send to other bridge ports directly */ if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); goto xmit_world; } dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { /* send to lowerdev first for its network taps */ dev_forward_skb(vlan->lowerdev, skb); return NET_XMIT_SUCCESS; } } xmit_world: skb->dev = vlan->lowerdev; return dev_queue_xmit_accel(skb, netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); unsigned int len = skb->len; int ret; if (unlikely(netpoll_tx_running(dev))) return macvlan_netpoll_send_skb(vlan, skb); ret = macvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); pcpu_stats->tx_packets++; pcpu_stats->tx_bytes += len; u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(vlan->pcpu_stats->tx_dropped); } return ret; } static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return dev_hard_header(skb, lowerdev, type, daddr, saddr ? : dev->dev_addr, len); } static const struct header_ops macvlan_hard_header_ops = { .create = macvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, }; static int macvlan_open(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; int err; if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto out; } goto hash_add; } err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; /* Attempt to populate accel_priv which is used to offload the L2 * forwarding requests for unicast packets. */ if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) vlan->accel_priv = lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev); /* If earlier attempt to offload failed, or accel_priv is not * populated we must add the unicast address to the lower device. */ if (IS_ERR_OR_NULL(vlan->accel_priv)) { vlan->accel_priv = NULL; err = dev_uc_add(lowerdev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(lowerdev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto clear_multi; } hash_add: macvlan_hash_add(vlan); return 0; clear_multi: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); del_unicast: if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } else { dev_uc_del(lowerdev, dev->dev_addr); } out: return err; } static int macvlan_stop(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) dev_set_promiscuity(lowerdev, -1); goto hash_del; } if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(lowerdev, -1); dev_uc_del(lowerdev, dev->dev_addr); hash_del: macvlan_hash_del(vlan, !dev->dismantle); return 0; } static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { /* Just copy in the new address */ eth_hw_addr_set(dev, addr); } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; dev_uc_del(lowerdev, dev->dev_addr); } macvlan_hash_change_addr(vlan, addr); } if (macvlan_passthru(port) && !macvlan_addr_change(port)) { /* Since addr_change isn't set, we are here due to lower * device change. Save the lower-dev address so we can * restore it later. */ ether_addr_copy(vlan->port->perm_addr, lowerdev->dev_addr); } macvlan_clear_addr_change(port); return 0; } static int macvlan_set_mac_address(struct net_device *dev, void *p) { struct macvlan_dev *vlan = netdev_priv(dev); struct sockaddr *addr = p; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; /* If the addresses are the same, this is a no-op */ if (ether_addr_equal(dev->dev_addr, addr->sa_data)) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { macvlan_set_addr_change(vlan->port); return dev_set_mac_address(vlan->lowerdev, addr, NULL); } if (macvlan_addr_busy(vlan->port, addr->sa_data)) return -EADDRINUSE; return macvlan_sync_address(dev, addr->sa_data); } static void macvlan_change_rx_flags(struct net_device *dev, int change) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void macvlan_compute_filter(unsigned long *mc_filter, struct net_device *dev, struct macvlan_dev *vlan) { if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ); } else { struct netdev_hw_addr *ha; DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ); bitmap_zero(filter, MACVLAN_MC_FILTER_SZ); netdev_for_each_mc_addr(ha, dev) { __set_bit(mc_hash(vlan, ha->addr), filter); } __set_bit(mc_hash(vlan, dev->broadcast), filter); bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ); } } static void macvlan_set_mac_lists(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); macvlan_compute_filter(vlan->mc_filter, dev, vlan); dev_uc_sync(vlan->lowerdev, dev); dev_mc_sync(vlan->lowerdev, dev); /* This is slightly inaccurate as we're including the subscription * list of vlan->lowerdev too. * * Bug alert: This only works if everyone has the same broadcast * address as lowerdev. As soon as someone changes theirs this * will break. * * However, this is already broken as when you change your broadcast * address we don't get called. * * The solution is to maintain a list of broadcast addresses like * we do for uc/mc, if you care. */ macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL); } static int macvlan_change_mtu(struct net_device *dev, int new_mtu) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->lowerdev->mtu < new_mtu) return -EINVAL; dev->mtu = new_mtu; return 0; } static int macvlan_eth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = macvlan_dev_real_dev(dev); const struct net_device_ops *ops = real_dev->netdev_ops; struct ifreq ifrr; int err = -EOPNOTSUPP; strscpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { case SIOCSHWTSTAMP: if (!net_eq(dev_net(dev), &init_net)) break; fallthrough; case SIOCGHWTSTAMP: if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; } if (!err) ifr->ifr_ifru = ifrr.ifr_ifru; return err; } /* * macvlan network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key macvlan_netdev_addr_lock_key; #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) #define ALWAYS_ON_FEATURES (ALWAYS_ON_OFFLOADS | NETIF_F_LLTX) #define MACVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_LRO | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static void macvlan_set_lockdep_class(struct net_device *dev) { netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macvlan_netdev_addr_lock_key); } static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); const struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; dev->state = (dev->state & ~MACVLAN_STATE_MASK) | (lowerdev->state & MACVLAN_STATE_MASK); dev->features = lowerdev->features & MACVLAN_FEATURES; dev->features |= ALWAYS_ON_FEATURES; dev->hw_features |= NETIF_F_LRO; dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; dev->gso_max_size = lowerdev->gso_max_size; dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; port->count += 1; return 0; } static void macvlan_uninit(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; free_percpu(vlan->pcpu_stats); macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); } static void macvlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->pcpu_stats) { struct vlan_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_errors = 0, tx_dropped = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { start = u64_stats_fetch_begin_irq(&p->syncp); rx_packets = p->rx_packets; rx_bytes = p->rx_bytes; rx_multicast = p->rx_multicast; tx_packets = p->tx_packets; tx_bytes = p->tx_bytes; } while (u64_stats_fetch_retry_irq(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* rx_errors & tx_dropped are u32, updated * without syncp protection. */ rx_errors += p->rx_errors; tx_dropped += p->tx_dropped; } stats->rx_errors = rx_errors; stats->rx_dropped = rx_errors; stats->tx_dropped = tx_dropped; } } static int macvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return vlan_vid_add(lowerdev, proto, vid); } static int macvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; vlan_vid_del(lowerdev, proto, vid); return 0; } static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); return err; } static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } static void macvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strlcpy(drvinfo->driver, "macvlan", sizeof(drvinfo->driver)); strlcpy(drvinfo->version, "0.1", sizeof(drvinfo->version)); } static int macvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct macvlan_dev *vlan = netdev_priv(dev); return __ethtool_get_link_ksettings(vlan->lowerdev, cmd); } static int macvlan_ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { struct net_device *real_dev = macvlan_dev_real_dev(dev); const struct ethtool_ops *ops = real_dev->ethtool_ops; struct phy_device *phydev = real_dev->phydev; if (phy_has_tsinfo(phydev)) { return phy_ts_info(phydev, info); } else if (ops->get_ts_info) { return ops->get_ts_info(real_dev, info); } else { info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; } return 0; } static netdev_features_t macvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct macvlan_dev *vlan = netdev_priv(dev); netdev_features_t lowerdev_features = vlan->lowerdev->features; netdev_features_t mask; features |= NETIF_F_ALL_FOR_ALL; features &= (vlan->set_features | ~MACVLAN_FEATURES); mask = features; lowerdev_features &= (features | ~NETIF_F_LRO); features = netdev_increment_features(lowerdev_features, features, mask); features |= ALWAYS_ON_FEATURES; features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES); return features; } #ifdef CONFIG_NET_POLL_CONTROLLER static void macvlan_dev_poll_controller(struct net_device *dev) { return; } static int macvlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; struct netpoll *netpoll; int err; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void macvlan_dev_netpoll_cleanup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int macvlan_dev_get_iflink(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return vlan->lowerdev->ifindex; } static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = macvlan_ethtool_get_link_ksettings, .get_drvinfo = macvlan_ethtool_get_drvinfo, .get_ts_info = macvlan_ethtool_get_ts_info, }; static const struct net_device_ops macvlan_netdev_ops = { .ndo_init = macvlan_init, .ndo_uninit = macvlan_uninit, .ndo_open = macvlan_open, .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, .ndo_change_mtu = macvlan_change_mtu, .ndo_eth_ioctl = macvlan_eth_ioctl, .ndo_fix_features = macvlan_fix_features, .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_rx_mode = macvlan_set_mac_lists, .ndo_get_stats64 = macvlan_dev_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = macvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = macvlan_vlan_rx_kill_vid, .ndo_fdb_add = macvlan_fdb_add, .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = macvlan_dev_poll_controller, .ndo_netpoll_setup = macvlan_dev_netpoll_setup, .ndo_netpoll_cleanup = macvlan_dev_netpoll_cleanup, #endif .ndo_get_iflink = macvlan_dev_get_iflink, .ndo_features_check = passthru_features_check, .ndo_change_proto_down = dev_change_proto_down_generic, }; void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); /* ether_setup() has set dev->min_mtu to ETH_MIN_MTU. */ dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->netdev_ops = &macvlan_netdev_ops; dev->needs_free_netdev = true; dev->header_ops = &macvlan_hard_header_ops; dev->ethtool_ops = &macvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(macvlan_common_setup); static void macvlan_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; } static int macvlan_port_create(struct net_device *dev) { struct macvlan_port *port; unsigned int i; int err; if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; if (netdev_is_rx_handler_busy(dev)) return -EBUSY; port = kzalloc(sizeof(*port), GFP_KERNEL); if (port == NULL) return -ENOMEM; port->dev = dev; ether_addr_copy(port->perm_addr, dev->dev_addr); INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_source_hash[i]); port->bc_queue_len_used = 0; skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); if (err) kfree(port); else dev->priv_flags |= IFF_MACVLAN_PORT; return err; } static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = macvlan_port_get_rtnl(dev); struct sk_buff *skb; dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); /* After this point, no packet can schedule bc_work anymore, * but we need to cancel it and purge left skbs if any. */ cancel_work_sync(&port->bc_work); while ((skb = __skb_dequeue(&port->bc_queue))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; if (src) dev_put(src->dev); kfree_skb(skb); } /* If the lower device address has been changed by passthru * macvlan, put it back. */ if (macvlan_passthru(port) && !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { struct sockaddr sa; sa.sa_family = port->dev->type; memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len); dev_set_mac_address(port->dev, &sa, NULL); } kfree(port); } static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *nla, *head; int rem, len; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; if (data[IFLA_MACVLAN_FLAGS] && nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { case MACVLAN_MODE_PRIVATE: case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: case MACVLAN_MODE_SOURCE: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { case MACVLAN_MACADDR_ADD: case MACVLAN_MACADDR_DEL: case MACVLAN_MACADDR_FLUSH: case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR]) { if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) return -EADDRNOTAVAIL; } if (data[IFLA_MACVLAN_MACADDR_DATA]) { head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { if (nla_type(nla) != IFLA_MACVLAN_MACADDR || nla_len(nla) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(nla))) return -EADDRNOTAVAIL; } } if (data[IFLA_MACVLAN_MACADDR_COUNT]) return -EINVAL; return 0; } /* * reconfigure list of remote source mac address * (only for macvlan devices in source mode) * Note regarding alignment: all netlink data is aligned to 4 Byte, which * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. */ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, struct nlattr *data[]) { char *addr = NULL; int ret, rem, len; struct nlattr *nla, *head; struct macvlan_source_entry *entry; if (data[IFLA_MACVLAN_MACADDR]) addr = nla_data(data[IFLA_MACVLAN_MACADDR]); if (mode == MACVLAN_MACADDR_ADD) { if (!addr) return -EINVAL; return macvlan_hash_add_source(vlan, addr); } else if (mode == MACVLAN_MACADDR_DEL) { if (!addr) return -EINVAL; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) { macvlan_hash_del_source(entry); vlan->macaddr_count--; } } else if (mode == MACVLAN_MACADDR_FLUSH) { macvlan_flush_sources(vlan->port, vlan); } else if (mode == MACVLAN_MACADDR_SET) { macvlan_flush_sources(vlan->port, vlan); if (addr) { ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } if (!data[IFLA_MACVLAN_MACADDR_DATA]) return 0; head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { addr = nla_data(nla); ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } } else { return -EINVAL; } return 0; } int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; struct net_device *lowerdev; int err; int macmode; bool create = false; if (!tb[IFLA_LINK]) return -EINVAL; lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; /* When creating macvlans or macvtaps on top of other macvlans - use * the real device as the lowerdev. */ if (netif_is_macvlan(lowerdev)) lowerdev = macvlan_dev_real_dev(lowerdev); if (!tb[IFLA_MTU]) dev->mtu = lowerdev->mtu; else if (dev->mtu > lowerdev->mtu) return -EINVAL; /* MTU range: 68 - lowerdev->max_mtu */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = lowerdev->max_mtu; if (!tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (!netif_is_macvlan_port(lowerdev)) { err = macvlan_port_create(lowerdev); if (err < 0) return err; create = true; } port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ if (macvlan_passthru(port)) { /* The macvlan port must be not created this time, * still goto destroy_macvlan_port for readability. */ err = -EINVAL; goto destroy_macvlan_port; } vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; vlan->set_features = MACVLAN_FEATURES; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); if (data && data[IFLA_MACVLAN_FLAGS]) vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { if (port->count) { err = -EINVAL; goto destroy_macvlan_port; } macvlan_set_passthru(port); eth_hw_addr_inherit(dev, lowerdev); } if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) { err = -EINVAL; goto destroy_macvlan_port; } macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); err = macvlan_changelink_sources(vlan, macmode, data); if (err) goto destroy_macvlan_port; } vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN; if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); err = register_netdevice(dev); if (err < 0) goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; list_add_tail_rcu(&vlan->list, &port->vlans); update_port_bc_queue_len(vlan->port); netif_stacked_transfer_operstate(lowerdev, dev); linkwatch_fire_event(dev); return 0; unregister_netdev: /* macvlan_uninit would free the macvlan port */ unregister_netdevice(dev); return err; destroy_macvlan_port: /* the macvlan port may be freed by macvlan_uninit when fail to register. * so we destroy the macvlan port only when it's valid. */ if (create && macvlan_port_get_rtnl(lowerdev)) { macvlan_flush_sources(port, vlan); macvlan_port_destroy(port->dev); } return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); static int macvlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return macvlan_common_newlink(src_net, dev, tb, data, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->mode == MACVLAN_MODE_SOURCE) macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); update_port_bc_queue_len(vlan->port); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); } EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; enum macvlan_macaddr_mode macmode; int ret; /* Validate mode, but don't set yet: setting flags may fail. */ if (data && data[IFLA_MACVLAN_MODE]) { set_mode = true; mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); /* Passthrough mode can't be set or cleared dynamically */ if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; if (vlan->mode == MACVLAN_MODE_SOURCE && vlan->mode != mode) macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC; if (macvlan_passthru(vlan->port) && promisc) { int err; if (flags & MACVLAN_FLAG_NOPROMISC) err = dev_set_promiscuity(vlan->lowerdev, -1); else err = dev_set_promiscuity(vlan->lowerdev, 1); if (err < 0) return err; } vlan->flags = flags; } if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) { vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); update_port_bc_queue_len(vlan->port); } if (set_mode) vlan->mode = mode; if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) return -EINVAL; macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); ret = macvlan_changelink_sources(vlan, macmode, data); if (ret) return ret; } return 0; } static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) { if (vlan->macaddr_count == 0) return 0; return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); } static size_t macvlan_get_size(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */ ); } static int macvlan_fill_info_macaddr(struct sk_buff *skb, const struct macvlan_dev *vlan, const int i) { struct hlist_head *h = &vlan->port->vlan_source_hash[i]; struct macvlan_source_entry *entry; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (entry->vlan != vlan) continue; if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) return 1; } return 0; } static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; int i; struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) goto nla_put_failure; if (vlan->macaddr_count > 0) { nest = nla_nest_start_noflag(skb, IFLA_MACVLAN_MACADDR_DATA); if (nest == NULL) goto nla_put_failure; for (i = 0; i < MACVLAN_HASH_SIZE; i++) { if (macvlan_fill_info_macaddr(skb, vlan, i)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT }, }; int macvlan_link_register(struct rtnl_link_ops *ops) { /* common fields */ ops->validate = macvlan_validate; ops->maxtype = IFLA_MACVLAN_MAX; ops->policy = macvlan_policy; ops->changelink = macvlan_changelink; ops->get_size = macvlan_get_size; ops->fill_info = macvlan_fill_info; return rtnl_link_register(ops); }; EXPORT_SYMBOL_GPL(macvlan_link_register); static struct net *macvlan_get_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", .setup = macvlan_setup, .newlink = macvlan_newlink, .dellink = macvlan_dellink, .get_link_net = macvlan_get_link_net, .priv_size = sizeof(struct macvlan_dev), }; static void update_port_bc_queue_len(struct macvlan_port *port) { u32 max_bc_queue_len_req = 0; struct macvlan_dev *vlan; list_for_each_entry(vlan, &port->vlans, list) { if (vlan->bc_queue_len_req > max_bc_queue_len_req) max_bc_queue_len_req = vlan->bc_queue_len_req; } port->bc_queue_len_used = max_bc_queue_len_req; } static int macvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan, *next; struct macvlan_port *port; LIST_HEAD(list_kill); if (!netif_is_macvlan_port(dev)) return NOTIFY_DONE; port = macvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(vlan, &port->vlans, list) netif_stacked_transfer_operstate(vlan->lowerdev, vlan->dev); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { vlan->dev->gso_max_size = dev->gso_max_size; vlan->dev->gso_max_segs = dev->gso_max_segs; netdev_update_features(vlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(vlan, &port->vlans, list) { if (vlan->dev->mtu <= dev->mtu) continue; dev_set_mtu(vlan->dev, dev->mtu); } break; case NETDEV_CHANGEADDR: if (!macvlan_passthru(port)) return NOTIFY_DONE; vlan = list_first_entry_or_null(&port->vlans, struct macvlan_dev, list); if (vlan && macvlan_sync_address(vlan->dev, dev->dev_addr)) return NOTIFY_BAD; break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(vlan, next, &port->vlans, list) vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill); unregister_netdevice_many(&list_kill); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to all vlans */ list_for_each_entry(vlan, &port->vlans, list) call_netdevice_notifiers(event, vlan->dev); } return NOTIFY_DONE; } static struct notifier_block macvlan_notifier_block __read_mostly = { .notifier_call = macvlan_device_event, }; static int __init macvlan_init_module(void) { int err; register_netdevice_notifier(&macvlan_notifier_block); err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; err1: unregister_netdevice_notifier(&macvlan_notifier_block); return err; } static void __exit macvlan_cleanup_module(void) { rtnl_link_unregister(&macvlan_link_ops); unregister_netdevice_notifier(&macvlan_notifier_block); } module_init(macvlan_init_module); module_exit(macvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Driver for MAC address based VLANs"); MODULE_ALIAS_RTNL_LINK("macvlan");
142 142 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 // SPDX-License-Identifier: GPL-2.0-only /* Expectation handling for nf_conntrack. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (c) 2005-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); struct hlist_head *nf_ct_expect_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static siphash_key_t nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { struct { union nf_inet_addr dst_addr; u32 net_mix; u16 dport; u8 l3num; u8 protonum; } __aligned(SIPHASH_ALIGNMENT) combined; u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); memset(&combined, 0, sizeof(combined)); combined.dst_addr = tuple->dst.u3; combined.net_mix = net_hash_mix(n); combined.dport = (__force __u16)tuple->dst.u.all; combined.l3num = tuple->src.l3num; combined.protonum = tuple->dst.protonum; hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } static bool nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_expect *i, const struct nf_conntrack_zone *zone, const struct net *net) { return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && net_eq(net, nf_ct_net(i->master)) && nf_ct_zone_equal_any(i->master, zone); } bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; } return false; } EXPORT_SYMBOL_GPL(nf_ct_remove_expect); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; } EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); i = __nf_ct_expect_find(net, zone, tuple); if (i && !refcount_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); return i; } EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } } if (!exp) return NULL; /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (!nf_ct_is_confirmed(exp->master)) return NULL; /* Avoid race with other CPUs, that for exp->master ct, is * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } /* Undo exp->master refcnt increase, if del_timer() failed */ nf_ct_put(exp->master); return NULL; } /* delete all expectations for this conntrack */ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; struct hlist_node *next; /* Optimization: most connection never expect any others. */ if (!help) return; spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { nf_ct_remove_expect(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); /* Would two expected things clash? */ static inline int expect_clash(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { /* Part covered by intersection of masks must be unequal, otherwise they clash */ struct nf_conntrack_tuple_mask intersect_mask; int count; intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ intersect_mask.src.u3.all[count] = a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static bool master_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b, unsigned int flags) { if (flags & NF_CT_EXP_F_SKIP_MASTER) return true; return a->master == b->master; } /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_remove_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); /* We don't increase the master conntrack refcount for non-fulfilled * conntracks. During the conntrack destruction, the expectations are * always killed before the conntrack itself */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) { struct nf_conntrack_expect *new; new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); if (!new) return NULL; new->master = me; refcount_set(&new->use, 1); return new; } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) { int len; if (family == AF_INET) len = 4; else len = 16; exp->flags = 0; exp->class = class; exp->expectfn = NULL; exp->helper = NULL; exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; if (saddr) { memcpy(&exp->tuple.src.u3, saddr, len); if (sizeof(exp->tuple.src.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.src.u3 + len, 0x00, sizeof(exp->tuple.src.u3) - len); memset(&exp->mask.src.u3, 0xFF, len); if (sizeof(exp->mask.src.u3) > len) memset((void *)&exp->mask.src.u3 + len, 0x00, sizeof(exp->mask.src.u3) - len); } else { memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); } if (src) { exp->tuple.src.u.all = *src; exp->mask.src.u.all = htons(0xFFFF); } else { exp->tuple.src.u.all = 0; exp->mask.src.u.all = 0; } memcpy(&exp->tuple.dst.u3, daddr, len); if (sizeof(exp->tuple.dst.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.dst.u3 + len, 0x00, sizeof(exp->tuple.dst.u3) - len); exp->tuple.dst.u.all = *dst; #if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif } EXPORT_SYMBOL_GPL(nf_ct_expect_init); static void nf_ct_expect_free_rcu(struct rcu_head *head) { struct nf_conntrack_expect *exp; exp = container_of(head, struct nf_conntrack_expect, rcu); kmem_cache_free(nf_ct_expect_cachep, exp); } void nf_ct_expect_put(struct nf_conntrack_expect *exp) { if (refcount_dec_and_test(&exp->use)) call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0); helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; } add_timer(&exp->timeout); hlist_add_head_rcu(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, struct nf_conntrack_expect *new) { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } if (last) nf_ct_remove_expect(last); } static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, unsigned int flags) { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; int ret = 0; if (!master_help) { ret = -ESHUTDOWN; goto out; } h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (master_matches(i, expect, flags) && expect_matches(i, expect)) { if (i->class != expect->class || i->master != expect->master) return -EALREADY; if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; } } /* Will be over limit? */ helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); if (master_help->expecting[expect->class] >= p->max_expected) { ret = -EMFILE; goto out; } } } cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } out: return ret; } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags) { int ret; spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect, flags); if (ret < 0) goto out; nf_ct_expect_insert(expect); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) continue; if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); #ifdef CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } return NULL; } static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head = ct_expect_get_first(seq); if (head) while (pos && (head = ct_expect_get_next(seq, head))) pos--; return pos ? NULL : head; } static void *exp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ct_expect_get_idx(seq, *pos); } static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return ct_expect_get_next(seq, v); } static void exp_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int exp_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_expect *expect; struct nf_conntrack_helper *helper; struct hlist_node *n = v; char *delim = ""; expect = hlist_entry(n, struct nf_conntrack_expect, hnode); if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ? (long)(expect->timeout.expires - jiffies)/HZ : 0); else seq_puts(s, "- "); seq_printf(s, "l3proto = %u proto=%u ", expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); delim = ","; } if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); delim = ","; } if (expect->flags & NF_CT_EXPECT_USERSPACE) seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } seq_putc(s, '\n'); return 0; } static const struct seq_operations exp_seq_ops = { .start = exp_seq_start, .next = exp_seq_next, .stop = exp_seq_stop, .show = exp_seq_show }; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ static int exp_proc_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; proc = proc_create_net("nf_conntrack_expect", 0440, net->proc_net, &exp_seq_ops, sizeof(struct ct_expect_iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ return 0; } static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); } int nf_conntrack_expect_init(void) { if (!nf_ct_expect_hsize) { nf_ct_expect_hsize = nf_conntrack_htable_size / 256; if (!nf_ct_expect_hsize) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (!nf_ct_expect_hash) { kmem_cache_destroy(nf_ct_expect_cachep); return -ENOMEM; } return 0; } void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); kvfree(nf_ct_expect_hash); }
3 3 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/ethtool.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/bonding.h> #include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct bonding *bond = PDE_DATA(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; bond_for_each_slave_rcu(bond, slave, iter) if (++off == *pos) return slave; return NULL; } static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bonding *bond = PDE_DATA(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; ++*pos; if (v == SEQ_START_TOKEN) return bond_first_slave_rcu(bond); bond_for_each_slave_rcu(bond, slave, iter) { if (found) return slave; if (slave == v) found = true; } return NULL; } static void bond_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = PDE_DATA(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; curr = rcu_dereference(bond->curr_active_slave); seq_printf(seq, "Bonding Mode: %s", bond_mode_name(BOND_MODE(bond))); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac) { optval = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); seq_printf(seq, " (fail_over_mac %s)", optval->string); } seq_printf(seq, "\n"); if (bond_mode_uses_xmit_hash(bond)) { optval = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); seq_printf(seq, "Transmit Hash Policy: %s (%d)\n", optval->string, bond->params.xmit_policy); } if (bond_uses_primary(bond)) { primary = rcu_dereference(bond->primary_slave); seq_printf(seq, "Primary Slave: %s", primary ? primary->dev->name : "None"); if (primary) { optval = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); seq_printf(seq, " (primary_reselect %s)", optval->string); } seq_printf(seq, "\nCurrently Active Slave: %s\n", (curr) ? curr->dev->name : "None"); } seq_printf(seq, "MII Status: %s\n", netif_carrier_ok(bond->dev) ? "up" : "down"); seq_printf(seq, "MII Polling Interval (ms): %d\n", bond->params.miimon); seq_printf(seq, "Up Delay (ms): %d\n", bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); seq_printf(seq, "Peer Notification Delay (ms): %d\n", bond->params.peer_notif_delay * bond->params.miimon); /* ARP information */ if (bond->params.arp_interval > 0) { int printed = 0; seq_printf(seq, "ARP Polling Interval (ms): %d\n", bond->params.arp_interval); seq_printf(seq, "ARP Missed Max: %u\n", bond->params.missed_max); seq_printf(seq, "ARP IP target/s (n.n.n.n form):"); for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { if (!bond->params.arp_targets[i]) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI4", &bond->params.arp_targets[i]); printed = 1; } seq_printf(seq, "\n"); } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP active: %s\n", (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); optval = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", optval->string); if (capable(CAP_NET_ADMIN)) { seq_printf(seq, "System priority: %d\n", BOND_AD_INFO(bond).system.sys_priority); seq_printf(seq, "System MAC address: %pM\n", &BOND_AD_INFO(bond).system.sys_mac_addr); if (__bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", bond->dev->name); } else { seq_printf(seq, "Active Aggregator Info:\n"); seq_printf(seq, "\tAggregator ID: %d\n", ad_info.aggregator_id); seq_printf(seq, "\tNumber of ports: %d\n", ad_info.ports); seq_printf(seq, "\tActor Key: %d\n", ad_info.actor_key); seq_printf(seq, "\tPartner Key: %d\n", ad_info.partner_key); seq_printf(seq, "\tPartner Mac Address: %pM\n", ad_info.partner_system); } } } } static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { struct bonding *bond = PDE_DATA(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); if (slave->speed == SPEED_UNKNOWN) seq_printf(seq, "Speed: %s\n", "Unknown"); else seq_printf(seq, "Speed: %d Mbps\n", slave->speed); if (slave->duplex == DUPLEX_UNKNOWN) seq_printf(seq, "Duplex: %s\n", "Unknown"); else seq_printf(seq, "Duplex: %s\n", slave->duplex ? "full" : "half"); seq_printf(seq, "Link Failure Count: %u\n", slave->link_failure_count); seq_printf(seq, "Permanent HW addr: %*phC\n", slave->dev->addr_len, slave->perm_hwaddr); seq_printf(seq, "Slave queue ID: %d\n", slave->queue_id); if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; const struct aggregator *agg = port->aggregator; if (agg) { seq_printf(seq, "Aggregator ID: %d\n", agg->aggregator_identifier); seq_printf(seq, "Actor Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_actor_state)); seq_printf(seq, "Partner Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_partner_state)); seq_printf(seq, "Actor Churned Count: %d\n", port->churn_actor_count); seq_printf(seq, "Partner Churned Count: %d\n", port->churn_partner_count); if (capable(CAP_NET_ADMIN)) { seq_puts(seq, "details actor lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->actor_system_priority); seq_printf(seq, " system mac address: %pM\n", &port->actor_system); seq_printf(seq, " port key: %d\n", port->actor_oper_port_key); seq_printf(seq, " port priority: %d\n", port->actor_port_priority); seq_printf(seq, " port number: %d\n", port->actor_port_number); seq_printf(seq, " port state: %d\n", port->actor_oper_port_state); seq_puts(seq, "details partner lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->partner_oper.system_priority); seq_printf(seq, " system mac address: %pM\n", &port->partner_oper.system); seq_printf(seq, " oper key: %d\n", port->partner_oper.key); seq_printf(seq, " port priority: %d\n", port->partner_oper.port_priority); seq_printf(seq, " port number: %d\n", port->partner_oper.port_number); seq_printf(seq, " port state: %d\n", port->partner_oper.port_state); } } else { seq_puts(seq, "Aggregator ID: N/A\n"); } } } static int bond_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "%s\n", bond_version); bond_info_show_master(seq); } else bond_info_show_slave(seq, v); return 0; } static const struct seq_operations bond_info_seq_ops = { .start = bond_info_seq_start, .next = bond_info_seq_next, .stop = bond_info_seq_stop, .show = bond_info_seq_show, }; void bond_create_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir) { bond->proc_entry = proc_create_seq_data(bond_dev->name, 0444, bn->proc_dir, &bond_info_seq_ops, bond); if (bond->proc_entry == NULL) netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n", DRV_NAME, bond_dev->name); else memcpy(bond->proc_file_name, bond_dev->name, IFNAMSIZ); } } void bond_remove_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir && bond->proc_entry) { remove_proc_entry(bond->proc_file_name, bn->proc_dir); memset(bond->proc_file_name, 0, IFNAMSIZ); bond->proc_entry = NULL; } } /* Create the bonding directory under /proc/net, if doesn't exist yet. * Caller must hold rtnl_lock. */ void __net_init bond_create_proc_dir(struct bond_net *bn) { if (!bn->proc_dir) { bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); if (!bn->proc_dir) pr_warn("Warning: Cannot create /proc/net/%s\n", DRV_NAME); } } /* Destroy the bonding directory under /proc/net, if empty. * Caller must hold rtnl_lock. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { if (bn->proc_dir) { remove_proc_entry(DRV_NAME, bn->net->proc_net); bn->proc_dir = NULL; } }
11 11 11 8 8 8 8 8 8 8 8 8 8 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peer.h" #include "device.h" #include "queueing.h" #include "timers.h" #include "peerlookup.h" #include "noise.h" #include <linux/kref.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/list.h> static struct kmem_cache *peer_cache; static atomic64_t peer_counter = ATOMIC64_INIT(0); struct wg_peer *wg_peer_create(struct wg_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN], const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]) { struct wg_peer *peer; int ret = -ENOMEM; lockdep_assert_held(&wg->device_update_lock); if (wg->num_peers >= MAX_PEERS_PER_DEVICE) return ERR_PTR(ret); peer = kmem_cache_zalloc(peer_cache, GFP_KERNEL); if (unlikely(!peer)) return ERR_PTR(ret); if (unlikely(dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))) goto err; peer->device = wg; wg_noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, preshared_key, peer); peer->internal_id = atomic64_inc_return(&peer_counter); peer->serial_work_cpu = nr_cpumask_bits; wg_cookie_init(&peer->latest_cookie); wg_timers_init(peer); wg_cookie_checker_precompute_peer_keys(peer); spin_lock_init(&peer->keypairs.keypair_update_lock); INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); wg_prev_queue_init(&peer->tx_queue); wg_prev_queue_init(&peer->rx_queue); rwlock_init(&peer->endpoint_lock); kref_init(&peer->refcount); skb_queue_head_init(&peer->staged_packet_queue); wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); set_bit(NAPI_STATE_NO_BUSY_POLL, &peer->napi.state); netif_napi_add(wg->dev, &peer->napi, wg_packet_rx_poll, NAPI_POLL_WEIGHT); napi_enable(&peer->napi); list_add_tail(&peer->peer_list, &wg->peer_list); INIT_LIST_HEAD(&peer->allowedips_list); wg_pubkey_hashtable_add(wg->peer_hashtable, peer); ++wg->num_peers; pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); return peer; err: kmem_cache_free(peer_cache, peer); return ERR_PTR(ret); } struct wg_peer *wg_peer_get_maybe_zero(struct wg_peer *peer) { RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(), "Taking peer reference without holding the RCU read lock"); if (unlikely(!peer || !kref_get_unless_zero(&peer->refcount))) return NULL; return peer; } static void peer_make_dead(struct wg_peer *peer) { /* Remove from configuration-time lookup structures. */ list_del_init(&peer->peer_list); wg_allowedips_remove_by_peer(&peer->device->peer_allowedips, peer, &peer->device->device_update_lock); wg_pubkey_hashtable_remove(peer->device->peer_hashtable, peer); /* Mark as dead, so that we don't allow jumping contexts after. */ WRITE_ONCE(peer->is_dead, true); /* The caller must now synchronize_net() for this to take effect. */ } static void peer_remove_after_dead(struct wg_peer *peer) { WARN_ON(!peer->is_dead); /* No more keypairs can be created for this peer, since is_dead protects * add_new_keypair, so we can now destroy existing ones. */ wg_noise_keypairs_clear(&peer->keypairs); /* Destroy all ongoing timers that were in-flight at the beginning of * this function. */ wg_timers_stop(peer); /* The transition between packet encryption/decryption queues isn't * guarded by is_dead, but each reference's life is strictly bounded by * two generations: once for parallel crypto and once for serial * ingestion, so we can simply flush twice, and be sure that we no * longer have references inside these queues. */ /* a) For encrypt/decrypt. */ flush_workqueue(peer->device->packet_crypt_wq); /* b.1) For send (but not receive, since that's napi). */ flush_workqueue(peer->device->packet_crypt_wq); /* b.2.1) For receive (but not send, since that's wq). */ napi_disable(&peer->napi); /* b.2.1) It's now safe to remove the napi struct, which must be done * here from process context. */ netif_napi_del(&peer->napi); /* Ensure any workstructs we own (like transmit_handshake_work or * clear_peer_work) no longer are in use. */ flush_workqueue(peer->device->handshake_send_wq); /* After the above flushes, a peer might still be active in a few * different contexts: 1) from xmit(), before hitting is_dead and * returning, 2) from wg_packet_consume_data(), before hitting is_dead * and returning, 3) from wg_receive_handshake_packet() after a point * where it has processed an incoming handshake packet, but where * all calls to pass it off to timers fails because of is_dead. We won't * have new references in (1) eventually, because we're removed from * allowedips; we won't have new references in (2) eventually, because * wg_index_hashtable_lookup will always return NULL, since we removed * all existing keypairs and no more can be created; we won't have new * references in (3) eventually, because we're removed from the pubkey * hash table, which allows for a maximum of one handshake response, * via the still-uncleared index hashtable entry, but not more than one, * and in wg_cookie_message_consume, the lookup eventually gets a peer * with a refcount of zero, so no new reference is taken. */ --peer->device->num_peers; wg_peer_put(peer); } /* We have a separate "remove" function make sure that all active places where * a peer is currently operating will eventually come to an end and not pass * their reference onto another context. */ void wg_peer_remove(struct wg_peer *peer) { if (unlikely(!peer)) return; lockdep_assert_held(&peer->device->device_update_lock); peer_make_dead(peer); synchronize_net(); peer_remove_after_dead(peer); } void wg_peer_remove_all(struct wg_device *wg) { struct wg_peer *peer, *temp; LIST_HEAD(dead_peers); lockdep_assert_held(&wg->device_update_lock); /* Avoid having to traverse individually for each one. */ wg_allowedips_free(&wg->peer_allowedips, &wg->device_update_lock); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { peer_make_dead(peer); list_add_tail(&peer->peer_list, &dead_peers); } synchronize_net(); list_for_each_entry_safe(peer, temp, &dead_peers, peer_list) peer_remove_after_dead(peer); } static void rcu_release(struct rcu_head *rcu) { struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); dst_cache_destroy(&peer->endpoint_cache); WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); /* The final zeroing takes care of clearing any remaining handshake key * material and other potentially sensitive information. */ memzero_explicit(peer, sizeof(*peer)); kmem_cache_free(peer_cache, peer); } static void kref_release(struct kref *refcount) { struct wg_peer *peer = container_of(refcount, struct wg_peer, refcount); pr_debug("%s: Peer %llu (%pISpfsc) destroyed\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr); /* Remove ourself from dynamic runtime lookup structures, now that the * last reference is gone. */ wg_index_hashtable_remove(peer->device->index_hashtable, &peer->handshake.entry); /* Remove any lingering packets that didn't have a chance to be * transmitted. */ wg_packet_purge_staged_packets(peer); /* Free the memory used. */ call_rcu(&peer->rcu, rcu_release); } void wg_peer_put(struct wg_peer *peer) { if (unlikely(!peer)) return; kref_put(&peer->refcount, kref_release); } int __init wg_peer_init(void) { peer_cache = KMEM_CACHE(wg_peer, 0); return peer_cache ? 0 : -ENOMEM; } void wg_peer_uninit(void) { kmem_cache_destroy(peer_cache); }
341 337 339 339 339 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET implementation * * Authors: * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> */ #include <linux/types.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/in.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/sock_reuseport.h> int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; struct flowi4 *fl4; struct rtable *rt; __be32 saddr; int oif; int err; if (addr_len < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; sk_dst_reset(sk); oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; } fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); goto out; } if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { ip_rt_put(rt); err = -EACCES; goto out; } if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; /* Update source address */ if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = fl4->saddr; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); inet->inet_id = prandom_u32(); sk_dst_set(sk, &rt->dst); err = 0; out: return err; } EXPORT_SYMBOL(__ip4_datagram_connect); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip4_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL(ip4_datagram_connect); /* Because UDP xmit path can manipulate sk_dst_cache without holding * socket lock, we need to use sk_dst_set() here, * even if we own the socket lock. */ void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { rcu_read_unlock(); return; } inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); dst = !IS_ERR(rt) ? &rt->dst : NULL; sk_dst_set(sk, dst); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NETFILTER_INGRESS_H_ #define _NETFILTER_INGRESS_H_ #include <linux/netfilter.h> #include <linux/netdevice.h> #ifdef CONFIG_NETFILTER_INGRESS static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; /* Must recheck the ingress hook head, in the event it became NULL * after the check in nf_hook_ingress_active evaluated to true. */ if (unlikely(!e)) return 0; nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; return ret; } static inline void nf_hook_ingress_init(struct net_device *dev) { RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) { return 0; } static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } static inline void nf_hook_ingress_init(struct net_device *dev) {} #endif /* CONFIG_NETFILTER_INGRESS */ #endif /* _NETFILTER_INGRESS_H_ */
113 110 111 1 111 19 3 110 106 107 101 18 5 18 18 4 18 106 103 104 1 1 100 99 4 1 112 2 116 115 113 113 21 1 103 102 18 5 4 5 107 20 100 5 18 18 4 24 101 100 1 101 99 7 18 18 17 19 104 103 18 107 105 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for SCTP. * * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_NONE] = "NONE", [SCTP_CONNTRACK_CLOSED] = "CLOSED", [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT #define sCE SCTP_CONNTRACK_COOKIE_ECHOED #define sES SCTP_CONNTRACK_ESTABLISHED #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sIV SCTP_CONNTRACK_MAX /* These are the descriptions of the states: NOTE: These state names are tantalizingly similar to the states of an SCTP endpoint. But the interpretation of the states is a little different, considering that these are the states of the connection and not of an end point. Please note the subtleties. -Kiran NONE - Nothing so far. COOKIE WAIT - We have seen an INIT chunk in the original direction, or also an INIT_ACK chunk in the reply direction. COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. */ /* TODO - I have assumed that the first INIT is in the original direction. This messes things when an INIT comes in the reply direction in CLOSED state. - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - Full Multi Homing support. */ /* SCTP conntrack state transitions */ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ /* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ /* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, /* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #endif #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned long *map) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; int flag; flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { pr_debug("Basic checks failed\n"); return 1; } if (map) set_bit(sch->type, map); } pr_debug("Basic checks passed\n"); return count == 0; } static int sctp_new_state(enum ip_conntrack_dir dir, enum sctp_conntrack cur_state, int chunk_type) { int i; pr_debug("Chunk type: %d\n", chunk_type); switch (chunk_type) { case SCTP_CID_INIT: pr_debug("SCTP_CID_INIT\n"); i = 0; break; case SCTP_CID_INIT_ACK: pr_debug("SCTP_CID_INIT_ACK\n"); i = 1; break; case SCTP_CID_ABORT: pr_debug("SCTP_CID_ABORT\n"); i = 2; break; case SCTP_CID_SHUTDOWN: pr_debug("SCTP_CID_SHUTDOWN\n"); i = 3; break; case SCTP_CID_SHUTDOWN_ACK: pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); i = 4; break; case SCTP_CID_ERROR: pr_debug("SCTP_CID_ERROR\n"); i = 5; break; case SCTP_CID_COOKIE_ECHO: pr_debug("SCTP_CID_COOKIE_ECHO\n"); i = 6; break; case SCTP_CID_COOKIE_ACK: pr_debug("SCTP_CID_COOKIE_ACK\n"); i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; case SCTP_CID_HEARTBEAT: pr_debug("SCTP_CID_HEARTBEAT"); i = 9; break; case SCTP_CID_HEARTBEAT_ACK: pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", sctp_conntrack_names[cur_state]); return cur_state; } pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", dir, sctp_conntrack_names[cur_state], chunk_type, sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); return sctp_conntracks[dir][i][cur_state]; } /* Don't need lock here: this conntrack not in circulation yet */ static noinline bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct sctphdr *sh, unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u32 offset, count; memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ if (new_state == SCTP_CONNTRACK_NONE || new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return false; } /* Copy the vtag into the state info */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _inithdr, *ih; /* Sec 8.5.1 (A) */ if (sh->vtag) return false; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(_inithdr), &_inithdr); if (!ih) return false; pr_debug("Setting vtag %x for new conn\n", ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; } static bool sctp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { const struct sctphdr *sh; const char *logmsg; if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; } skb->ip_summed = CHECKSUM_UNNECESSARY; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); const struct sctphdr *sh; struct sctphdr _sctph; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; if (do_basic_checks(ct, skb, dataoff, map) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ if (test_bit(SCTP_CID_ABORT, map) || test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || test_bit(SCTP_CID_COOKIE_ACK, map)) return -NF_ACCEPT; if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; } /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* (B) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* (C) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ACK) { ct->proto.sctp.init[dir] = 0; ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.last_dir = dir; ignore = true; continue; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || ct->proto.sctp.last_dir == dir) goto out_unlock; ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.vtag[dir] = sh->vtag; ct->proto.sctp.vtag[!dir] = 0; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } old_state = ct->proto.sctp.state; new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " "conntrack=%u\n", dir, sch->type, old_state); goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _ih, *ih; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) ct->proto.sctp.init[!dir] = 0; ct->proto.sctp.init[dir] = 1; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so * port reuse by client or NAT middlebox cannot * keep entry alive indefinitely (incl. nat info). */ if (new_state == SCTP_CONNTRACK_CLOSED && old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; } else if (sch->type == SCTP_CID_INIT_ACK) { struct sctp_inithdr _ih, *ih; __be32 vtag; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; vtag = ct->proto.sctp.vtag[!dir]; if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) goto out_unlock; /* collision */ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && vtag != ih->init_tag) goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (new_state == SCTP_CONNTRACK_ESTABLISHED && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } } spin_unlock_bh(&ct->lock); /* allow but do not refresh timeout */ if (ignore) return NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; out_unlock: spin_unlock_bh(&ct->lock); out: return -NF_ACCEPT; } static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { case SCTP_CONNTRACK_SHUTDOWN_SENT: case SCTP_CONNTRACK_SHUTDOWN_RECD: case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; #define SCTP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 4) + \ NLA_ALIGN(NLA_HDRLEN + 4)) static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; int err; /* updates may not contain the internal protocol info, skip parsing */ if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, sctp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_SCTP_STATE] || !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) return -EINVAL; spin_lock_bh(&ct->lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; if (!timeouts) timeouts = sn->timeouts; /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } static int sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; /* timeouts[0] is unused, init it so ->timeouts[0] contains * 'new' timeout, like udp or icmp. */ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .can_early_drop = sctp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_SCTP_MAX, .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
8 5 30 30 30 30 30 18 5 13 13 18 4 4 4 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 /* * * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * * Cgroup v2 * Copyright (C) 2019 Red Hat, Inc. * Author: Giuseppe Scrivano <gscrivan@redhat.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * */ #include <linux/cgroup.h> #include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) #define hugetlb_cgroup_from_counter(counter, idx) \ container_of(counter, struct hugetlb_cgroup, hugepage[idx]) static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline struct page_counter * __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, bool rsvd) { if (rsvd) return &h_cg->rsvd_hugepage[idx]; return &h_cg->hugepage[idx]; } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) { return (h_cg == root_h_cgroup); } static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { int idx; for (idx = 0; idx < hugetlb_max_hstate; idx++) { if (page_counter_read( hugetlb_cgroup_counter_from_cgroup(h_cg, idx))) return true; } return false; } static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, struct hugetlb_cgroup *parent_h_cgroup) { int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { struct page_counter *fault_parent = NULL; struct page_counter *rsvd_parent = NULL; unsigned long limit; int ret; if (parent_h_cgroup) { fault_parent = hugetlb_cgroup_counter_from_cgroup( parent_h_cgroup, idx); rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( parent_h_cgroup, idx); } page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), fault_parent); page_counter_init( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), rsvd_parent); limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), limit); VM_BUG_ON(ret); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), limit); VM_BUG_ON(ret); } } static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cgroup; h_cgroup = hugetlb_cgroup_from_css(css); kfree(h_cgroup); } /* * Should be called with hugetlb_lock held. * Since we are holding hugetlb_lock, pages cannot get moved from * active list or uncharged from the cgroup, So no need to get * page reference and test for page active here. This function * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page *page) { unsigned int nr_pages; struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); page_hcg = hugetlb_cgroup_from_page(page); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ if (!page_hcg || page_hcg != h_cg) goto out; nr_pages = compound_nr(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(page, parent); out: return; } /* * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; int idx; do { idx = 0; for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(idx, h_cg, page); spin_unlock_irq(&hugetlb_lock); idx++; } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); } static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, enum hugetlb_memory_event event) { atomic_long_inc(&hugetlb->events_local[idx][event]); cgroup_file_notify(&hugetlb->events_local_file[idx]); do { atomic_long_inc(&hugetlb->events[idx][event]); cgroup_file_notify(&hugetlb->events_file[idx]); } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && !hugetlb_cgroup_is_root(hugetlb)); } static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr, bool rsvd) { int ret = 0; struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; if (hugetlb_cgroup_disabled()) goto done; /* * We don't charge any cgroup if the compound page have less * than 3 pages. */ if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); if (!page_counter_try_charge( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages, &counter)) { ret = -ENOMEM; hugetlb_event(h_cg, idx, HUGETLB_MAX); css_put(&h_cg->css); goto done; } /* Reservations take a reference to the css because they do not get * reparented. */ if (!rsvd) css_put(&h_cg->css); done: *ptr = h_cg; return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); } int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); } /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; __set_hugetlb_cgroup(page, h_cg, rsvd); return; } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true); } /* * Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); h_cg = __hugetlb_cgroup_from_page(page, rsvd); if (unlikely(!h_cg)) return; __set_hugetlb_cgroup(page, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); return; } void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) { __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false); } void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, struct page *page) { __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) return; page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); } void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); } void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || !resv->css) return; page_counter_uncharge(resv->reservation_counter, (end - start) * resv->pages_per_hpage); css_put(resv->css); } void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); /* * Only do css_put(rg->css) when we delete the entire region * because one file_region must hold exactly one css reference. */ if (region_del) css_put(rg->css); } } enum { RES_USAGE, RES_RSVD_USAGE, RES_LIMIT, RES_RSVD_LIMIT, RES_MAX_USAGE, RES_RSVD_MAX_USAGE, RES_FAILCNT, RES_RSVD_FAILCNT, }; static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct page_counter *counter; struct page_counter *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_RSVD_USAGE: return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; case RES_RSVD_LIMIT: return (u64)rsvd_counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_RSVD_MAX_USAGE: return (u64)rsvd_counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; case RES_RSVD_FAILCNT: return rsvd_counter->failcnt; default: BUG(); } } static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) { int idx; u64 val; struct cftype *cft = seq_cft(seq); unsigned long limit; struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) seq_puts(seq, "max\n"); else seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; default: BUG(); } return 0; } static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, const char *max) { int ret, idx; unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); bool rsvd = false; if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ return -EINVAL; buf = strstrip(buf); ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; idx = MEMFILE_IDX(of_cft(of)->private); nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); } static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); } static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret = 0; struct page_counter *counter, *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: page_counter_reset_watermark(counter); break; case RES_RSVD_MAX_USAGE: page_counter_reset_watermark(rsvd_counter); break; case RES_FAILCNT: counter->failcnt = 0; break; case RES_RSVD_FAILCNT: rsvd_counter->failcnt = 0; break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) { if (hsize >= (1UL << 30)) snprintf(buf, size, "%luGB", hsize >> 30); else if (hsize >= (1UL << 20)) snprintf(buf, size, "%luMB", hsize >> 20); else snprintf(buf, size, "%luKB", hsize >> 10); return buf; } static int __hugetlb_events_show(struct seq_file *seq, bool local) { int idx; long max; struct cftype *cft = seq_cft(seq); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); if (local) max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); else max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); seq_printf(seq, "max %lu\n", max); return 0; } static int hugetlb_events_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, false); } static int hugetlb_events_local_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, true); } static void __init __hugetlb_cgroup_file_dfl_init(int idx) { char buf[32]; struct cftype *cft; struct hstate *h = &hstates[idx]; /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); /* Add the limit file */ cft = &h->cgroup_files_dfl[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->write = hugetlb_cgroup_write_dfl; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the reservation limit file */ cft = &h->cgroup_files_dfl[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->write = hugetlb_cgroup_write_dfl; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the current usage file */ cft = &h->cgroup_files_dfl[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the current reservation usage file */ cft = &h->cgroup_files_dfl[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the events file */ cft = &h->cgroup_files_dfl[4]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_show; cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the events.local file */ cft = &h->cgroup_files_dfl[5]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_local_show; cft->file_offset = offsetof(struct hugetlb_cgroup, events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; /* NULL terminate the last cft */ cft = &h->cgroup_files_dfl[6]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files_dfl)); } static void __init __hugetlb_cgroup_file_legacy_init(int idx) { char buf[32]; struct cftype *cft; struct hstate *h = &hstates[idx]; /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); /* Add the limit file */ cft = &h->cgroup_files_legacy[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; cft->write = hugetlb_cgroup_write_legacy; /* Add the reservation limit file */ cft = &h->cgroup_files_legacy[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; cft->write = hugetlb_cgroup_write_legacy; /* Add the usage file */ cft = &h->cgroup_files_legacy[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the reservation usage file */ cft = &h->cgroup_files_legacy[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX usage file */ cft = &h->cgroup_files_legacy[4]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX reservation usage file */ cft = &h->cgroup_files_legacy[5]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files_legacy[6]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the reservation failcntfile */ cft = &h->cgroup_files_legacy[7]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ cft = &h->cgroup_files_legacy[8]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files_legacy)); } static void __init __hugetlb_cgroup_file_init(int idx) { __hugetlb_cgroup_file_dfl_init(idx); __hugetlb_cgroup_file_legacy_init(idx); } void __init hugetlb_cgroup_file_init(void) { struct hstate *h; for_each_hstate(h) { /* * Add cgroup control files only if the huge page consists * of more than two normal pages. This is because we use * page[2].private for storing cgroup details. */ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) __hugetlb_cgroup_file_init(hstate_index(h)); } } /* * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = page_hstate(oldhpage); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); set_hugetlb_cgroup_rsvd(oldhpage, NULL); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(newhpage, h_cg); set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; } static struct cftype hugetlb_files[] = { {} /* terminate */ }; struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, .dfl_cftypes = hugetlb_files, .legacy_cftypes = hugetlb_files, };
190 247 820 221 757 244 51 51 91 102 116 16 111 110 111 65 64 31 65 64 505 505 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bug.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/types.h> /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. */ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. */ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
178 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IPC_NAMESPACE_H__ #define __IPC_NAMESPACE_H__ #include <linux/err.h> #include <linux/idr.h> #include <linux/rwsem.h> #include <linux/notifier.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/refcount.h> #include <linux/rhashtable-types.h> struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; struct rw_semaphore rwsem; struct idr ipcs_idr; int max_idx; int last_idx; /* For wrap around detection */ #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif struct rhashtable key_ht; }; struct ipc_namespace { struct ipc_ids ids[3]; int sem_ctls[4]; int used_sems; unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; atomic_t msg_bytes; atomic_t msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; unsigned long shm_tot; int shm_ctlmni; /* * Defines whether IPC_RMID is forced for _all_ shm segments regardless * of shmctl() */ int shm_rmid_forced; struct notifier_block ipcns_nb; /* The kern_mount of the mqueuefs sb. We take a ref on it */ struct vfsmount *mq_mnt; /* # queues in this ns, protected by mq_lock */ unsigned int mq_queues_count; /* next fields are set through sysctl */ unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ unsigned int mq_msg_default; unsigned int mq_msgsize_default; /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; struct ucounts *ucounts; struct llist_node mnt_llist; struct ns_common ns; } __randomize_layout; extern struct ipc_namespace init_ipc_ns; extern spinlock_t mq_lock; #ifdef CONFIG_SYSVIPC extern void shm_destroy_orphaned(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC */ static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #endif /* CONFIG_SYSVIPC */ #ifdef CONFIG_POSIX_MQUEUE extern int mq_init_ns(struct ipc_namespace *ns); /* * POSIX Message Queue default values: * * MIN_*: Lowest value an admin can set the maximum unprivileged limit to * DFLT_*MAX: Default values for the maximum unprivileged limits * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply * an attribute to the open call and the queue must be created * HARD_*: Highest value the maximums can be set to. These are enforced * on CAP_SYS_RESOURCE apps as well making them inviolate (so make them * suitably high) * * POSIX Requirements: * Per app minimum openable message queues - 8. This does not map well * to the fact that we limit the number of queues on a per namespace * basis instead of a per app basis. So, make the default high enough * that no given app should have a hard time opening 8 queues. * Minimum maximum for HARD_MSGMAX - 32767. I bumped this to 65536. * Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this. However, * we have run into a situation where running applications in the wild * require this to be at least 5MB, and preferably 10MB, so I set the * value to 16MB in hopes that this user is the worst of the bunch and * the new maximum will handle anyone else. I may have to revisit this * in the future. */ #define DFLT_QUEUESMAX 256 #define MIN_MSGMAX 1 #define DFLT_MSG 10U #define DFLT_MSGMAX 10 #define HARD_MSGMAX 65536 #define MIN_MSGSIZEMAX 128 #define DFLT_MSGSIZE 8192U #define DFLT_MSGSIZEMAX 8192 #define HARD_MSGSIZEMAX (16*1024*1024) #else static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { if (refcount_inc_not_zero(&ns->ns.count)) return ns; } return NULL; } extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); return ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { return ns; } static inline void put_ipc_ns(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_POSIX_MQUEUE_SYSCTL struct ctl_table_header; extern struct ctl_table_header *mq_register_sysctl_table(void); #else /* CONFIG_POSIX_MQUEUE_SYSCTL */ static inline struct ctl_table_header *mq_register_sysctl_table(void) { return NULL; } #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */ #endif
58 58 57 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip6_flowlabel.c IPv6 flowlabel manager. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> #include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/rawv6.h> #include <net/transp_v6.h> #include <linux/uaccess.h> #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ #define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ #define FL_MAX_PER_SOCK 32 #define FL_MAX_SIZE 4096 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ static DEFINE_SPINLOCK(ip6_sk_fl_lock); DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); EXPORT_SYMBOL(ipv6_flowlabel_exclusive); #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ fl != NULL; \ fl = rcu_dereference_bh(fl->next)) #define for_each_fl_continue_rcu(fl) \ for (fl = rcu_dereference_bh(fl->next); \ fl != NULL; \ fl = rcu_dereference_bh(fl->next)) #define for_each_sk_fl_rcu(np, sfl) \ for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference_bh(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } return NULL; } static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; rcu_read_lock_bh(); fl = __fl_lookup(net, label); if (fl && !atomic_inc_not_zero(&fl->users)) fl = NULL; rcu_read_unlock_bh(); return fl; } static bool fl_shared_exclusive(struct ip6_flowlabel *fl) { return fl->share == IPV6_FL_S_EXCL || fl->share == IPV6_FL_S_PROCESS || fl->share == IPV6_FL_S_USER; } static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); kfree(fl->opt); kfree(fl); } static void fl_free(struct ip6_flowlabel *fl) { if (!fl) return; if (fl_shared_exclusive(fl) || fl->opt) static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (fl->opt && fl->share == IPV6_FL_S_EXCL) { struct ipv6_txoptions *opt = fl->opt; fl->opt = NULL; kfree(opt); } if (!timer_pending(&ip6_fl_gc_timer) || time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(struct timer_list *unused) { int i; unsigned long now = jiffies; unsigned long sched = 0; spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) sched = ttd; } flp = &fl->next; } } if (!sched && atomic_read(&fl_size)) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } spin_unlock(&ip6_fl_lock); } static void __net_exit ip6_fl_purge(struct net *net) { int i; spin_lock_bh(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } flp = &fl->next; } } spin_unlock_bh(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *fl, __be32 label) { struct ip6_flowlabel *lfl; fl->label = label & IPV6_FLOWLABEL_MASK; spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) break; } } } else { /* * we dropper the ip6_fl_lock, so this entry could reappear * and we need to recheck with it. * * OTOH no need to search the active socket first, like it is * done in ipv6_flowlabel_opt - sock is locked, so new entry * with the same label can only appear on another sock */ lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); spin_unlock_bh(&ip6_fl_lock); return NULL; } /* Socket flowlabel lists */ struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock_bh(); for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { fl->lastuse = jiffies; rcu_read_unlock_bh(); return fl; } } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (!rcu_access_pointer(np->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); spin_lock_bh(&ip6_sk_fl_lock); } spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ /* It is the only difficult place. flowlabel enforces equal headers before and including routing header, however user may supply options following rthdr. */ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt) { struct ipv6_txoptions *fl_opt = fl->opt; if (!fopt || fopt->opt_flen == 0) return fl_opt; if (fl_opt) { opt_space->hopopt = fl_opt->hopopt; opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; } else { if (fopt->opt_nflen == 0) return fopt; opt_space->hopopt = NULL; opt_space->dst0opt = NULL; opt_space->srcrt = NULL; opt_space->opt_nflen = 0; } opt_space->dst1opt = fopt->dst1opt; opt_space->opt_flen = fopt->opt_flen; opt_space->tot_len = fopt->tot_len; return opt_space; } EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { if (ttl < FL_MIN_LINGER) return FL_MIN_LINGER*HZ; if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) return 0; return ttl*HZ; } static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) { linger = check_linger(linger); if (!linger) return -EPERM; expires = check_linger(expires); if (!expires) return -EPERM; spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; if (time_before(expires, fl->linger)) expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; spin_unlock_bh(&ip6_fl_lock); return 0; } static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; olen = optlen - CMSG_ALIGN(sizeof(*freq)); err = -EINVAL; if (olen > 64 * 1024) goto done; err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); if (!fl) goto done; if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); if (!fl->opt) goto done; memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; if (copy_from_sockptr_offset(fl->opt + 1, optval, CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; if (fl->opt->opt_flen) goto done; if (fl->opt->opt_nflen == 0) { kfree(fl->opt); fl->opt = NULL; } } fl->fl_net = net; fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) goto done; fl->share = freq->flr_share; addr_type = ipv6_addr_type(&freq->flr_dst); if ((addr_type & IPV6_ADDR_MAPPED) || addr_type == IPV6_ADDR_ANY) { err = -EINVAL; goto done; } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: fl->owner.uid = current_euid(); break; default: err = -EINVAL; goto done; } if (fl_shared_exclusive(fl) || fl->opt) { WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); } return fl; done: if (fl) { kfree(fl->opt); kfree(fl); } *err_p = err; return NULL; } static int mem_check(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock_bh(); for_each_sk_fl_rcu(np, sfl) count++; rcu_read_unlock_bh(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; return 0; } static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = np->ipv6_fl_list; rcu_assign_pointer(np->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (flags & IPV6_FL_F_REMOTE) { freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; } if (np->repflow) { freq->flr_label = np->flow_label; return 0; } rcu_read_lock_bh(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; freq->flr_dst = sfl->fl->dst; freq->flr_share = sfl->fl->share; freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; freq->flr_linger = sfl->fl->linger / HZ; spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock_bh(); return 0; } } rcu_read_unlock_bh(); return -ENOENT; } #define socklist_dereference(__sflp) \ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist __rcu **sflp; struct ipv6_fl_socklist *sfl; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (!np->repflow) return -ESRCH; np->flow_label = 0; np->repflow = 0; return 0; } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) goto found; } spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; found: if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); return 0; } static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock_bh(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); rcu_read_unlock_bh(); return err; } } rcu_read_unlock_bh(); if (freq->flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); if (fl) { err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); fl_release(fl); return err; } } return -ESRCH; } static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen) { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (net->ipv6.sysctl.flowlabel_consistency) { net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); return -EPERM; } if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; np->repflow = 1; return 0; } if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; if (net->ipv6.sysctl.flowlabel_state_ranges && (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) return -ERANGE; fl = fl_create(net, sk, freq, optval, optlen, &err); if (!fl) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); if (freq->flr_label) { err = -EEXIST; rcu_read_lock_bh(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock_bh(); goto done; } fl1 = sfl->fl; if (!atomic_inc_not_zero(&fl1->users)) fl1 = NULL; break; } } rcu_read_unlock_bh(); if (!fl1) fl1 = fl_lookup(net, freq->flr_label); if (fl1) { recheck: err = -EEXIST; if (freq->flr_flags&IPV6_FL_F_EXCL) goto release; err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; if (!sfl1) goto release; if (fl->linger > fl1->linger) fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; fl_link(np, sfl1, fl1); fl_free(fl); return 0; release: fl_release(fl1); goto done; } } err = -ENOENT; if (!(freq->flr_flags & IPV6_FL_F_CREATE)) goto done; err = -ENOMEM; if (!sfl1) goto done; err = mem_check(sk); if (err != 0) goto done; fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; if (!freq->flr_label) { size_t offset = offsetof(struct in6_flowlabel_req, flr_label); if (copy_to_sockptr_offset(optval, offset, &fl->label, sizeof(fl->label))) { /* Intentionally ignore fault. */ } } fl_link(np, sfl1, fl); return 0; done: fl_free(fl); kfree(sfl1); return err; } int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) { struct in6_flowlabel_req freq; if (optlen < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; switch (freq.flr_action) { case IPV6_FL_A_PUT: return ipv6_flowlabel_put(sk, &freq); case IPV6_FL_A_RENEW: return ipv6_flowlabel_renew(sk, &freq); case IPV6_FL_A_GET: return ipv6_flowlabel_get(sk, &freq, optval, optlen); default: return -EINVAL; } } #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { struct seq_net_private p; struct pid_namespace *pid_ns; int bucket; }; #define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) { struct ip6_flowlabel *fl = NULL; struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for_each_fl_continue_rcu(fl) { if (net_eq(fl->fl_net, net)) goto out; } try_again: if (++state->bucket <= FL_HASH_MASK) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } goto try_again; } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_flowlabel *fl = ip6fl_get_first(seq); if (fl) while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) --pos; return pos ? NULL : fl; } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock_bh(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_flowlabel *fl; if (v == SEQ_START_TOKEN) fl = ip6fl_get_first(seq); else fl = ip6fl_get_next(seq, v); ++*pos; return fl; } static void ip6fl_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock_bh(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); } else { struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", (unsigned int)ntohl(fl->label), fl->share, ((fl->share == IPV6_FL_S_PROCESS) ? pid_nr_ns(fl->owner.pid, state->pid_ns) : ((fl->share == IPV6_FL_S_USER) ? from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, &fl->dst, fl->opt ? fl->opt->opt_nflen : 0); } return 0; } static const struct seq_operations ip6fl_seq_ops = { .start = ip6fl_seq_start, .next = ip6fl_seq_next, .stop = ip6fl_seq_stop, .show = ip6fl_seq_show, }; static int __net_init ip6_flowlabel_proc_init(struct net *net) { if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_ops, sizeof(struct ip6fl_iter_state))) return -ENOMEM; return 0; } static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) { return 0; } static inline void ip6_flowlabel_proc_fini(struct net *net) { } #endif static void __net_exit ip6_flowlabel_net_exit(struct net *net) { ip6_fl_purge(net); ip6_flowlabel_proc_fini(net); } static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, }; int ip6_flowlabel_init(void) { return register_pernet_subsys(&ip6_flowlabel_net_ops); } void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); }
30 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook */ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ #include <linux/list.h> #include <linux/spinlock_types.h> #define NR_BPF_LRU_LIST_T (3) #define NR_BPF_LRU_LIST_COUNT (2) #define NR_BPF_LRU_LOCAL_LIST_T (2) #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T enum bpf_lru_list_type { BPF_LRU_LIST_T_ACTIVE, BPF_LRU_LIST_T_INACTIVE, BPF_LRU_LIST_T_FREE, BPF_LRU_LOCAL_LIST_T_FREE, BPF_LRU_LOCAL_LIST_T_PENDING, }; struct bpf_lru_node { struct list_head list; u16 cpu; u8 type; u8 ref; }; struct bpf_lru_list { struct list_head lists[NR_BPF_LRU_LIST_T]; unsigned int counts[NR_BPF_LRU_LIST_COUNT]; /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; raw_spinlock_t lock ____cacheline_aligned_in_smp; }; struct bpf_lru_locallist { struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; u16 next_steal; raw_spinlock_t lock; }; struct bpf_common_lru { struct bpf_lru_list lru_list; struct bpf_lru_locallist __percpu *local_list; }; typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); struct bpf_lru { union { struct bpf_common_lru common_lru; struct bpf_lru_list __percpu *percpu_lru; }; del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; unsigned int nr_scans; bool percpu; }; static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) { if (!READ_ONCE(node->ref)) WRITE_ONCE(node->ref, 1); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *delete_arg); void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems); void bpf_lru_destroy(struct bpf_lru *lru); struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); #endif
44 12 31 33 33 21 21 21 2 19 2 1 2 1 1 161 19 151 143 10 90 8 3 18 76 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <linux/seq_file.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; struct icmp6hdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->icmp6_type; tuple->src.u.icmp.id = hp->icmp6_identifier; tuple->dst.u.icmp.code = hp->icmp6_code; return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 }; static const u_int8_t noct_valid_new[] = { [ICMPV6_MGM_QUERY - 130] = 1, [ICMPV6_MGM_REPORT - 130] = 1, [ICMPV6_MGM_REDUCTION - 130] = 1, [NDISC_ROUTER_SOLICITATION - 130] = 1, [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, [ICMPV6_MLD2_REPORT - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } static unsigned int *icmpv6_get_timeouts(struct net *net) { return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u8 valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; if (state->pf != NFPROTO_IPV6) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct)) { int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); return -NF_ACCEPT; } } if (!timeout) timeout = icmpv6_get_timeouts(nf_ct_net(ct)); /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } static noinline_for_stack int nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { u8 hl = ipv6_hdr(skb)->hop_limit; union nf_inet_addr outer_daddr; union { struct nd_opt_hdr nd_opt; struct rd_msg rd_msg; } tmp; const struct nd_opt_hdr *nd_opt; const struct rd_msg *rd_msg; rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); if (!rd_msg) { icmpv6_error_log(skb, state, "short redirect"); return -NF_ACCEPT; } if (rd_msg->icmph.icmp6_code != 0) return NF_ACCEPT; if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); return -NF_ACCEPT; } dataoff += sizeof(*rd_msg); /* warning: rd_msg no longer usable after this call */ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); if (!nd_opt || nd_opt->nd_opt_len == 0) { icmpv6_error_log(skb, state, "redirect without options"); return -NF_ACCEPT; } /* We could call ndisc_parse_options(), but it would need * skb_linearize() and a bit more work. */ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += 8; return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { icmpv6_error_log(skb, state, "short packet"); return -NF_ACCEPT; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) { icmpv6_error_log(skb, state, "ICMPv6 checksum failed"); return -NF_ACCEPT; } type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return NF_ACCEPT; } if (icmp6h->icmp6_type == NDISC_REDIRECT) return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += sizeof(*icmp6h); return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { if (!tb[CTA_PROTO_ICMPV6_TYPE]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); if (tuple->dst.u.icmp.type < 128 || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type - 128]) return -EINVAL; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { if (!tb[CTA_PROTO_ICMPV6_CODE]) return -EINVAL; tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { if (!tb[CTA_PROTO_ICMPV6_ID]) return -EINVAL; tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); } return 0; } static unsigned int icmpv6_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; } else { /* Set default ICMPv6 timeout. */ *timeout = in->timeout; } return 0; } static int icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_icmpv6_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmpv6_pernet(net); in->timeout = nf_ct_icmpv6_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmpv6_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FIRMWARE_LOADER_H #define __FIRMWARE_LOADER_H #include <linux/bitops.h> #include <linux/firmware.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/completion.h> #include <generated/utsrelease.h> /** * enum fw_opt - options to control firmware loading behaviour * * @FW_OPT_UEVENT: Enables the fallback mechanism to send a kobject uevent * when the firmware is not found. Userspace is in charge to load the * firmware using the sysfs loading facility. * @FW_OPT_NOWAIT: Used to describe the firmware request is asynchronous. * @FW_OPT_USERHELPER: Enable the fallback mechanism, in case the direct * filesystem lookup fails at finding the firmware. For details refer to * firmware_fallback_sysfs(). * @FW_OPT_NO_WARN: Quiet, avoid printing warning messages. * @FW_OPT_NOCACHE: Disables firmware caching. Firmware caching is used to * cache the firmware upon suspend, so that upon resume races against the * firmware file lookup on storage is avoided. Used for calls where the * file may be too big, or where the driver takes charge of its own * firmware caching mechanism. * @FW_OPT_NOFALLBACK_SYSFS: Disable the sysfs fallback mechanism. Takes * precedence over &FW_OPT_UEVENT and &FW_OPT_USERHELPER. * @FW_OPT_FALLBACK_PLATFORM: Enable fallback to device fw copy embedded in * the platform's main firmware. If both this fallback and the sysfs * fallback are enabled, then this fallback will be tried first. * @FW_OPT_PARTIAL: Allow partial read of firmware instead of needing to read * entire file. */ enum fw_opt { FW_OPT_UEVENT = BIT(0), FW_OPT_NOWAIT = BIT(1), FW_OPT_USERHELPER = BIT(2), FW_OPT_NO_WARN = BIT(3), FW_OPT_NOCACHE = BIT(4), FW_OPT_NOFALLBACK_SYSFS = BIT(5), FW_OPT_FALLBACK_PLATFORM = BIT(6), FW_OPT_PARTIAL = BIT(7), }; enum fw_status { FW_STATUS_UNKNOWN, FW_STATUS_LOADING, FW_STATUS_DONE, FW_STATUS_ABORTED, }; /* * Concurrent request_firmware() for the same firmware need to be * serialized. struct fw_state is simple state machine which hold the * state of the firmware loading. */ struct fw_state { struct completion completion; enum fw_status status; }; struct fw_priv { struct kref ref; struct list_head list; struct firmware_cache *fwc; struct fw_state fw_st; void *data; size_t size; size_t allocated_size; size_t offset; u32 opt_flags; #ifdef CONFIG_FW_LOADER_PAGED_BUF bool is_paged_buf; struct page **pages; int nr_pages; int page_array_size; #endif #ifdef CONFIG_FW_LOADER_USER_HELPER bool need_uevent; struct list_head pending_list; #endif const char *fw_name; }; extern struct mutex fw_lock; static inline bool __fw_state_check(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; return fw_st->status == status; } static inline int __fw_state_wait_common(struct fw_priv *fw_priv, long timeout) { struct fw_state *fw_st = &fw_priv->fw_st; long ret; ret = wait_for_completion_killable_timeout(&fw_st->completion, timeout); if (ret != 0 && fw_st->status == FW_STATUS_ABORTED) return -ENOENT; if (!ret) return -ETIMEDOUT; return ret < 0 ? ret : 0; } static inline void __fw_state_set(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; WRITE_ONCE(fw_st->status, status); if (status == FW_STATUS_DONE || status == FW_STATUS_ABORTED) { #ifdef CONFIG_FW_LOADER_USER_HELPER /* * Doing this here ensures that the fw_priv is deleted from * the pending list in all abort/done paths. */ list_del_init(&fw_priv->pending_list); #endif complete_all(&fw_st->completion); } } static inline void fw_state_aborted(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_ABORTED); } static inline bool fw_state_is_aborted(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_ABORTED); } static inline void fw_state_start(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_LOADING); } static inline void fw_state_done(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_DONE); } int assign_fw(struct firmware *fw, struct device *device); #ifdef CONFIG_FW_LOADER_PAGED_BUF void fw_free_paged_buf(struct fw_priv *fw_priv); int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed); int fw_map_paged_buf(struct fw_priv *fw_priv); bool fw_is_paged_buf(struct fw_priv *fw_priv); #else static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {} static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; } static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; } static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; } #endif #endif /* __FIRMWARE_LOADER_H */
54 2 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel reference Implementation * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines, Corp. * * This file is part of the SCTP kernel reference Implementation * * SCTP Checksum functions * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Dinakaran Joseph * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * * Rewritten to use libcrc32c by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ #ifndef __sctp_checksum_h__ #define __sctp_checksum_h__ #include <linux/types.h> #include <net/sctp/sctp.h> #include <linux/crc32c.h> #include <linux/crc32.h> static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) { /* This uses the crypto implementation of crc32c, which is either * implemented w/ hardware support or resolves to __crc32c_le(). */ return (__force __wsum)crc32c((__force __u32)sum, buff, len); } static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, int offset, int len) { return (__force __wsum)__crc32c_le_combine((__force __u32)csum, (__force __u32)csum2, len); } static const struct skb_checksum_ops sctp_csum_ops = { .update = sctp_csum_update, .combine = sctp_csum_combine, }; static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; __wsum new; sh->checksum = 0; new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, &sctp_csum_ops); sh->checksum = old; return cpu_to_le32((__force __u32)new); } #endif /* __sctp_checksum_h__ */
303 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Author: Mimi Zohar <zohar@us.ibm.com> * * File: ima_api.c * Implements must_appraise_or_measure, collect_measurement, * appraise_measurement, store_measurement and store_template. */ #include <linux/slab.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/evm.h> #include <linux/iversion.h> #include "ima.h" /* * ima_free_template_entry - free an existing template entry */ void ima_free_template_entry(struct ima_template_entry *entry) { int i; for (i = 0; i < entry->template_desc->num_fields; i++) kfree(entry->template_data[i].data); kfree(entry->digests); kfree(entry); } /* * ima_alloc_init_template - create and initialize a new template entry */ int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *desc) { struct ima_template_desc *template_desc; struct tpm_digest *digests; int i, result = 0; if (desc) template_desc = desc; else template_desc = ima_template_desc_current(); *entry = kzalloc(struct_size(*entry, template_data, template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { kfree(*entry); *entry = NULL; return -ENOMEM; } (*entry)->digests = digests; (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { const struct ima_template_field *field = template_desc->fields[i]; u32 len; result = field->field_init(event_data, &((*entry)->template_data[i])); if (result != 0) goto out; len = (*entry)->template_data[i].len; (*entry)->template_data_len += sizeof(len); (*entry)->template_data_len += len; } return 0; out: ima_free_template_entry(*entry); *entry = NULL; return result; } /* * ima_store_template - store ima template measurements * * Calculate the hash of a template entry, add the template entry * to an ordered list of measurement entries maintained inside the kernel, * and also update the aggregate integrity value (maintained inside the * configured TPM PCR) over the hashes of the current list of measurement * entries. * * Applications retrieve the current kernel-held measurement list through * the securityfs entries in /sys/kernel/security/ima. The signed aggregate * TPM PCR (called quote) can be retrieved using a TPM user space library * and is used to validate the measurement list. * * Returns 0 on success, error code otherwise */ int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "hashing_error"; char *template_name = entry->template_desc->name; int result; if (!violation) { result = ima_calc_field_array_hash(&entry->template_data[0], entry); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, template_name, op, audit_cause, result, 0); return result; } } entry->pcr = pcr; result = ima_add_template_entry(entry, violation, op, inode, filename); return result; } /* * ima_add_violation - add violation to measurement list. * * Violations are flagged in the measurement list with zero hash values. * By extending the PCR with 0xFF's instead of with zeroes, the PCR * value is invalidated. */ void ima_add_violation(struct file *file, const unsigned char *filename, struct integrity_iint_cache *iint, const char *op, const char *cause) { struct ima_template_entry *entry; struct inode *inode = file_inode(file); struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .violation = cause }; int violation = 1; int result; /* can overflow, only indicator */ atomic_long_inc(&ima_htable.violations); result = ima_alloc_init_template(&event_data, &entry, NULL); if (result < 0) { result = -ENOMEM; goto err_out; } result = ima_store_template(entry, violation, inode, filename, CONFIG_IMA_MEASURE_PCR_IDX); if (result < 0) ima_free_template_entry(entry); err_out: integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, cause, result, 0); } /** * ima_get_action - appraise & measure decision based on policy. * @mnt_userns: user namespace of the mount the inode was found from * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate * @secid: secid of the task being validated * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC, * MAY_APPEND) * @func: caller identifier * @pcr: pointer filled in if matched measure policy sets pcr= * @template_desc: pointer filled in if matched measure policy sets template= * @func_data: func specific data, may be NULL * @allowed_algos: allowlist of hash algorithms for the IMA xattr * * The policy is defined in terms of keypairs: * subj=, obj=, type=, func=, mask=, fsmagic= * subj,obj, and type: are LSM specific. * func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK * | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA * mask: contains the permission mask * fsmagic: hex value * * Returns IMA_MEASURE, IMA_APPRAISE mask. * */ int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH; flags &= ima_policy_flag; return ima_match_policy(mnt_userns, inode, cred, secid, func, mask, flags, pcr, template_desc, func_data, allowed_algos); } /* * ima_collect_measurement - collect file measurement * * Calculate the file hash, if it doesn't already exist, * storing the measurement and i_version in the iint. * * Must be called with iint->mutex held. * * Return 0 on success, error code otherwise */ int ima_collect_measurement(struct integrity_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig) { const char *audit_cause = "failed"; struct inode *inode = file_inode(file); struct inode *real_inode = d_real_inode(file_dentry(file)); struct name_snapshot filename; int result = 0; int length; void *tmpbuf; u64 i_version; struct { struct ima_digest_data hdr; char digest[IMA_MAX_DIGEST_SIZE]; } hash; /* * Always collect the modsig, because IMA might have already collected * the file digest without collecting the modsig in a previous * measurement rule. */ if (modsig) ima_collect_modsig(modsig, buf, size); if (iint->flags & IMA_COLLECTED) goto out; /* * Dectecting file change is based on i_version. On filesystems * which do not support i_version, support is limited to an initial * measurement/appraisal/audit. */ i_version = inode_query_iversion(inode); hash.hdr.algo = algo; /* Initialize hash digest to 0's in case of failure */ memset(&hash.digest, 0, sizeof(hash.digest)); if (buf) result = ima_calc_buffer_hash(buf, size, &hash.hdr); else result = ima_calc_file_hash(file, &hash.hdr); if (result && result != -EBADF && result != -EINVAL) goto out; length = sizeof(hash.hdr) + hash.hdr.length; tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS); if (!tmpbuf) { result = -ENOMEM; goto out; } iint->ima_hash = tmpbuf; memcpy(iint->ima_hash, &hash, length); iint->version = i_version; if (real_inode != inode) { iint->real_ino = real_inode->i_ino; iint->real_dev = real_inode->i_sb->s_dev; } /* Possibly temporary failure due to type of read (eg. O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; out: if (result) { if (file->f_flags & O_DIRECT) audit_cause = "failed(directio)"; take_dentry_name_snapshot(&filename, file->f_path.dentry); integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename.name.name, "collect_data", audit_cause, result, 0); release_dentry_name_snapshot(&filename); } return result; } /* * ima_store_measurement - store file measurement * * Create an "ima" template and then store the template by calling * ima_store_template. * * We only get here if the inode has not already been measured, * but the measurement could already exist: * - multiple copies of the same file on either the same or * different filesystems. * - the inode was previously flushed as well as the iint info, * containing the hashing info. * * Must be called with iint->mutex held. */ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "ENOMEM"; int result = -ENOMEM; struct inode *inode = file_inode(file); struct ima_template_entry *entry; struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .xattr_value = xattr_value, .xattr_len = xattr_len, .modsig = modsig }; int violation = 0; /* * We still need to store the measurement in the case of MODSIG because * we only have its contents to put in the list at the time of * appraisal, but a file measurement from earlier might already exist in * the measurement list. */ if (iint->measured_pcrs & (0x1 << pcr) && !modsig) return; result = ima_alloc_init_template(&event_data, &entry, template_desc); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, 0); return; } result = ima_store_template(entry, violation, inode, filename, pcr); if ((!result || result == -EEXIST) && !(file->f_flags & O_DIRECT)) { iint->flags |= IMA_MEASURED; iint->measured_pcrs |= (0x1 << pcr); } if (result < 0) ima_free_template_entry(entry); } void ima_audit_measurement(struct integrity_iint_cache *iint, const unsigned char *filename) { struct audit_buffer *ab; char *hash; const char *algo_name = hash_algo_name[iint->ima_hash->algo]; int i; if (iint->flags & IMA_AUDITED) return; hash = kzalloc((iint->ima_hash->length * 2) + 1, GFP_KERNEL); if (!hash) return; for (i = 0; i < iint->ima_hash->length; i++) hex_byte_pack(hash + (i * 2), iint->ima_hash->digest[i]); hash[i * 2] = '\0'; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_INTEGRITY_RULE); if (!ab) goto out; audit_log_format(ab, "file="); audit_log_untrustedstring(ab, filename); audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hash); audit_log_task_info(ab); audit_log_end(ab); iint->flags |= IMA_AUDITED; out: kfree(hash); return; } /* * ima_d_path - return a pointer to the full pathname * * Attempt to return a pointer to the full pathname for use in the * IMA measurement list, IMA audit records, and auditing logs. * * On failure, return a pointer to a copy of the filename, not dname. * Returning a pointer to dname, could result in using the pointer * after the memory has been freed. */ const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf) { struct name_snapshot filename; char *pathname = NULL; *pathbuf = __getname(); if (*pathbuf) { pathname = d_absolute_path(path, *pathbuf, PATH_MAX); if (IS_ERR(pathname)) { __putname(*pathbuf); *pathbuf = NULL; pathname = NULL; } } if (!pathname) { take_dentry_name_snapshot(&filename, path->dentry); strscpy(namebuf, filename.name.name, NAME_MAX); release_dentry_name_snapshot(&filename); pathname = namebuf; } return pathname; }
47 47 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/netdevice.h> #include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #ifdef CONFIG_LWTUNNEL #include <net/netfilter/nf_hooks_lwtunnel.h> #endif #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; MODULE_PARM_DESC(enable_hooks, "Always enable conntrack hooks"); module_param(enable_hooks, bool, 0000); unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) { switch (tuple->src.l3num) { case NFPROTO_IPV4: seq_printf(s, "src=%pI4 dst=%pI4 ", &tuple->src.u3.ip, &tuple->dst.u3.ip); break; case NFPROTO_IPV6: seq_printf(s, "src=%pI6 dst=%pI6 ", tuple->src.u3.ip6, tuple->dst.u3.ip6); break; default: break; } switch (l4proto->l4proto) { case IPPROTO_ICMP: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_TCP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), ntohs(tuple->dst.u.udp.port)); break; case IPPROTO_DCCP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.dccp.port), ntohs(tuple->dst.u.dccp.port)); break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), ntohs(tuple->dst.u.sctp.port)); break; case IPPROTO_ICMPV6: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_GRE: seq_printf(s, "srckey=0x%x dstkey=0x%x ", ntohs(tuple->src.u.gre.key), ntohs(tuple->dst.u.gre.key)); break; default: break; } } EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; unsigned int bucket; u_int64_t time_now; }; static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; st->bucket < st->htable_size; st->bucket++) { n = rcu_dereference( hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } return NULL; } static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_nulls_node *head = ct_get_first(seq); if (head) while (pos && (head = ct_get_next(seq, head))) pos--; return pos ? NULL : head; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return ct_get_next(s, v); } static void ct_seq_stop(struct seq_file *s, void *v) __releases(RCU) { rcu_read_unlock(); } #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { int ret; u32 len; char *secctx; ret = security_secid_to_secctx(ct->secmark, &secctx, &len); if (ret) return; seq_printf(s, "secctx=%s ", secctx); security_release_secctx(secctx, len); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { } #endif #ifdef CONFIG_NF_CONNTRACK_ZONES static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); if (zone->dir != dir) return; switch (zone->dir) { case NF_CT_DEFAULT_ZONE_DIR: seq_printf(s, "zone=%u ", zone->id); break; case NF_CT_ZONE_DIR_ORIG: seq_printf(s, "zone-orig=%u ", zone->id); break; case NF_CT_ZONE_DIR_REPL: seq_printf(s, "zone-reply=%u ", zone->id); break; default: break; } } #else static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { } #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { struct ct_iter_state *st = s->private; struct nf_conn_tstamp *tstamp; s64 delta_time; tstamp = nf_conn_tstamp_find(ct); if (tstamp) { delta_time = st->time_now - tstamp->start; if (delta_time > 0) delta_time = div_s64(delta_time, NSEC_PER_SEC); else delta_time = 0; seq_printf(s, "delta-time=%llu ", (unsigned long long)delta_time); } return; } #else static inline void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { } #endif static const char* l3proto_name(u16 proto) { switch (proto) { case AF_INET: return "ipv4"; case AF_INET6: return "ipv6"; } return "unknown"; } static const char* l4proto_name(u16 proto) { switch (proto) { case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; case IPPROTO_DCCP: return "dccp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; case IPPROTO_ICMPV6: return "icmpv6"; } return "unknown"; } static unsigned int seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { struct nf_conn_acct *acct; struct nf_conn_counter *counter; acct = nf_conn_acct_find(ct); if (!acct) return 0; counter = acct->counter; seq_printf(s, "packets=%llu bytes=%llu ", (unsigned long long)atomic64_read(&counter[dir].packets), (unsigned long long)atomic64_read(&counter[dir].bytes)); return 0; } /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); const struct nf_conntrack_l4proto *l4proto; struct net *net = seq_file_net(s); int ret = 0; WARN_ON(!ct); if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; if (nf_ct_should_gc(ct)) { nf_ct_kill(ct); goto release; } /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) goto release; if (!net_eq(nf_ct_net(ct), net)) goto release; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct), l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) l4proto->print_conntrack(s, ct); print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); if (seq_has_overflowed(s)) goto release; if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) goto release; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[HW_OFFLOAD] "); else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); if (seq_has_overflowed(s)) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; ret = 0; release: nf_ct_put(ct); return ret; } static const struct seq_operations ct_seq_ops = { .start = ct_seq_start, .next = ct_seq_next, .stop = ct_seq_stop, .show = ct_seq_show }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } return NULL; } static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } (*pos)++; return NULL; } static void ct_cpu_seq_stop(struct seq_file *seq, void *v) { } static int ct_cpu_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); const struct ip_conntrack_stat *st = v; unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } nr_conntracks = nf_conntrack_count(net); seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, st->clash_resolve, st->found, 0, st->invalid, 0, 0, st->chaintoolong, st->insert, st->insert_failed, st->drop, st->early_drop, st->error, st->expect_new, st->expect_create, st->expect_delete, st->search_restart ); return 0; } static const struct seq_operations ct_cpu_seq_ops = { .start = ct_cpu_seq_start, .next = ct_cpu_seq_next, .stop = ct_cpu_seq_stop, .show = ct_cpu_seq_show, }; static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; kuid_t root_uid; kgid_t root_gid; pde = proc_create_net("nf_conntrack", 0440, net->proc_net, &ct_seq_ops, sizeof(struct ct_iter_state)); if (!pde) goto out_nf_conntrack; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(pde, root_uid, root_gid); pde = proc_create_net("nf_conntrack", 0444, net->proc_net_stat, &ct_cpu_seq_ops, sizeof(struct seq_net_private)); if (!pde) goto out_stat_nf_conntrack; return 0; out_stat_nf_conntrack: remove_proc_entry("nf_conntrack", net->proc_net); out_nf_conntrack: return -ENOMEM; } static void nf_conntrack_standalone_fini_proc(struct net *net) { remove_proc_entry("nf_conntrack", net->proc_net_stat); remove_proc_entry("nf_conntrack", net->proc_net); } #else static int nf_conntrack_standalone_init_proc(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_proc(struct net *net) { } #endif /* CONFIG_NF_CONNTRACK_PROCFS */ u32 nf_conntrack_count(const struct net *net) { const struct nf_conntrack_net *cnet = nf_ct_pernet(net); return atomic_read(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_count); /* Sysctl support */ #ifdef CONFIG_SYSCTL /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; /* module_param hashsize could have changed value */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; /* update ret, we might not be able to satisfy request */ ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user); /* update it to the actual value used by conntrack */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; return ret; } static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { NF_SYSCTL_CT_MAX, NF_SYSCTL_CT_COUNT, NF_SYSCTL_CT_BUCKETS, NF_SYSCTL_CT_CHECKSUM, NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, NF_SYSCTL_CT_HELPER, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_SYSCTL_CT_TIMESTAMP, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT, NF_SYSCTL_CT_PROTO_DCCP_LOOSE, #endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif #ifdef CONFIG_LWTUNNEL NF_SYSCTL_CT_LWTUNNEL, #endif __NF_SYSCTL_CT_LAST_SYSCTL, }; #define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_BUCKETS] = { .procname = "nf_conntrack_buckets", .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = nf_conntrack_hash_sysctl, }, [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_HELPER] = { .procname = "nf_conntrack_helper", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { .procname = "nf_conntrack_generic_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT] = { .procname = "nf_conntrack_tcp_timeout_syn_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV] = { .procname = "nf_conntrack_tcp_timeout_syn_recv", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED] = { .procname = "nf_conntrack_tcp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT] = { .procname = "nf_conntrack_tcp_timeout_fin_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT] = { .procname = "nf_conntrack_tcp_timeout_close_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK] = { .procname = "nf_conntrack_tcp_timeout_last_ack", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT] = { .procname = "nf_conntrack_tcp_timeout_time_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE] = { .procname = "nf_conntrack_tcp_timeout_close", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS] = { .procname = "nf_conntrack_tcp_timeout_max_retrans", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK] = { .procname = "nf_conntrack_tcp_timeout_unacknowledged", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { .procname = "nf_flowtable_tcp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = { .procname = "nf_conntrack_tcp_ignore_invalid_rst", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { .procname = "nf_conntrack_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM] = { .procname = "nf_conntrack_udp_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { .procname = "nf_flowtable_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6] = { .procname = "nf_conntrack_icmpv6_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_NF_CT_PROTO_SCTP [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED] = { .procname = "nf_conntrack_sctp_timeout_closed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT] = { .procname = "nf_conntrack_sctp_timeout_cookie_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED] = { .procname = "nf_conntrack_sctp_timeout_cookie_echoed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED] = { .procname = "nf_conntrack_sctp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .procname = "nf_conntrack_sctp_timeout_shutdown_recd", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { .procname = "nf_conntrack_dccp_timeout_request", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = { .procname = "nf_conntrack_dccp_timeout_respond", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = { .procname = "nf_conntrack_dccp_timeout_partopen", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = { .procname = "nf_conntrack_dccp_timeout_open", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = { .procname = "nf_conntrack_dccp_timeout_closereq", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = { .procname = "nf_conntrack_dccp_timeout_closing", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = { .procname = "nf_conntrack_dccp_timeout_timewait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { .procname = "nf_conntrack_dccp_loose", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { .procname = "nf_conntrack_gre_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM] = { .procname = "nf_conntrack_gre_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif #ifdef CONFIG_LWTUNNEL [NF_SYSCTL_CT_LWTUNNEL] = { .procname = "nf_hooks_lwtunnel", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = nf_hooks_lwtunnel_sysctl_handler, }, #endif {} }; static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, struct ctl_table *table) { struct nf_tcp_net *tn = nf_tcp_pernet(net); #define XASSIGN(XNAME, tn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ ## XNAME].data = \ &(tn)->timeouts[TCP_CONNTRACK_ ## XNAME] XASSIGN(SYN_SENT, tn); XASSIGN(SYN_RECV, tn); XASSIGN(ESTABLISHED, tn); XASSIGN(FIN_WAIT, tn); XASSIGN(CLOSE_WAIT, tn); XASSIGN(LAST_ACK, tn); XASSIGN(TIME_WAIT, tn); XASSIGN(CLOSE, tn); XASSIGN(RETRANS, tn); XASSIGN(UNACK, tn); #undef XASSIGN #define XASSIGN(XNAME, rval) \ table[NF_SYSCTL_CT_PROTO_TCP_ ## XNAME].data = (rval) XASSIGN(LOOSE, &tn->tcp_loose); XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst); #undef XASSIGN #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; #endif } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net *sn = nf_sctp_pernet(net); #define XASSIGN(XNAME, sn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ ## XNAME].data = \ &(sn)->timeouts[SCTP_CONNTRACK_ ## XNAME] XASSIGN(CLOSED, sn); XASSIGN(COOKIE_WAIT, sn); XASSIGN(COOKIE_ECHOED, sn); XASSIGN(ESTABLISHED, sn); XASSIGN(SHUTDOWN_SENT, sn); XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); #undef XASSIGN #endif } static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_DCCP struct nf_dccp_net *dn = nf_dccp_pernet(net); #define XASSIGN(XNAME, dn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \ &(dn)->dccp_timeout[CT_DCCP_ ## XNAME] XASSIGN(REQUEST, dn); XASSIGN(RESPOND, dn); XASSIGN(PARTOPEN, dn); XASSIGN(OPEN, dn); XASSIGN(CLOSEREQ, dn); XASSIGN(CLOSING, dn); XASSIGN(TIMEWAIT, dn); #undef XASSIGN table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose; #endif } static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_GRE struct nf_gre_net *gn = nf_gre_pernet(net); table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE].data = &gn->timeouts[GRE_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM].data = &gn->timeouts[GRE_CT_REPLIED]; #endif } static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL); table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), GFP_KERNEL); if (!table) return -ENOMEM; table[NF_SYSCTL_CT_COUNT].data = &cnet->count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp; #endif table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; #endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't allow non-init_net ns to alter global sysctls */ if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!cnet->sysctl_header) goto out_unregister_netfilter; return 0; out_unregister_netfilter: kfree(table); return -ENOMEM; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); kfree(table); } #else static int nf_conntrack_standalone_init_sysctl(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { } #endif /* CONFIG_SYSCTL */ static void nf_conntrack_fini_net(struct net *net) { if (enable_hooks) nf_ct_netns_put(net, NFPROTO_INET); nf_conntrack_standalone_fini_proc(net); nf_conntrack_standalone_fini_sysctl(net); } static int nf_conntrack_pernet_init(struct net *net) { int ret; net->ct.sysctl_checksum = 1; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) return ret; ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; ret = nf_conntrack_init_net(net); if (ret < 0) goto out_init_net; if (enable_hooks) { ret = nf_ct_netns_get(net, NFPROTO_INET); if (ret < 0) goto out_hooks; } return 0; out_hooks: nf_conntrack_cleanup_net(net); out_init_net: nf_conntrack_standalone_fini_proc(net); out_proc: nf_conntrack_standalone_fini_sysctl(net); return ret; } static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) nf_conntrack_fini_net(net); nf_conntrack_cleanup_net_list(net_exit_list); } static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, .id = &nf_conntrack_net_id, .size = sizeof(struct nf_conntrack_net), }; static int __init nf_conntrack_standalone_init(void) { int ret = nf_conntrack_init_start(); if (ret < 0) goto out_start; BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER); #ifdef CONFIG_SYSCTL nf_ct_netfilter_header = register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); if (!nf_ct_netfilter_header) { pr_err("nf_conntrack: can't register to sysctl.\n"); ret = -ENOMEM; goto out_sysctl; } nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif nf_conntrack_init_end(); ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; return 0; out_pernet: #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); out_sysctl: #endif nf_conntrack_cleanup_end(); out_start: return ret; } static void __exit nf_conntrack_standalone_fini(void) { nf_conntrack_cleanup_start(); unregister_pernet_subsys(&nf_conntrack_net_ops); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); #endif nf_conntrack_cleanup_end(); } module_init(nf_conntrack_standalone_init); module_exit(nf_conntrack_standalone_fini);
286 9 66 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 /* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (skb->napi_id < MIN_NAPI_ID) skb->napi_id = napi->napi_id; #endif } /* used in the protocol hanlder to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, skb->napi_id); #endif } static inline void sk_mark_napi_id_once_xdp(struct sock *sk, const struct xdp_buff *xdp) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */
41 34 1 5 10 55 55 65 65 65 64 30 30 30 30 12 12 12 4 4 34 34 33 34 34 33 41 41 41 5 45 46 45 1 1 6 6 34 34 2 2 33 34 34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH * Copyright (C) 2018 - 2019, 2021 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS #define __MAC80211_DRIVER_OPS #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #define check_sdata_in_driver(sdata) ({ \ !WARN_ONCE(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ }) static inline struct ieee80211_sub_if_data * get_bss_sdata(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); return sdata; } static inline void drv_tx(struct ieee80211_local *local, struct ieee80211_tx_control *control, struct sk_buff *skb) { local->ops->tx(&local->hw, control, skb); } static inline void drv_sync_rx_queues(struct ieee80211_local *local, struct sta_info *sta) { if (local->ops->sync_rx_queues) { trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta); local->ops->sync_rx_queues(&local->hw); trace_drv_return_void(local); } } static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata, u32 sset, u8 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_strings) { trace_drv_get_et_strings(local, sset); local->ops->get_et_strings(&local->hw, &sdata->vif, sset, data); trace_drv_return_void(local); } } static inline void drv_get_et_stats(struct ieee80211_sub_if_data *sdata, struct ethtool_stats *stats, u64 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_stats) { trace_drv_get_et_stats(local); local->ops->get_et_stats(&local->hw, &sdata->vif, stats, data); trace_drv_return_void(local); } } static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata, int sset) { struct ieee80211_local *local = sdata->local; int rv = 0; if (local->ops->get_et_sset_count) { trace_drv_get_et_sset_count(local, sset); rv = local->ops->get_et_sset_count(&local->hw, &sdata->vif, sset); trace_drv_return_int(local, rv); } return rv; } int drv_start(struct ieee80211_local *local); void drv_stop(struct ieee80211_local *local); #ifdef CONFIG_PM static inline int drv_suspend(struct ieee80211_local *local, struct cfg80211_wowlan *wowlan) { int ret; might_sleep(); trace_drv_suspend(local); ret = local->ops->suspend(&local->hw, wowlan); trace_drv_return_int(local, ret); return ret; } static inline int drv_resume(struct ieee80211_local *local) { int ret; might_sleep(); trace_drv_resume(local); ret = local->ops->resume(&local->hw); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_wakeup(struct ieee80211_local *local, bool enabled) { might_sleep(); if (!local->ops->set_wakeup) return; trace_drv_set_wakeup(local, enabled); local->ops->set_wakeup(&local->hw, enabled); trace_drv_return_void(local); } #endif int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p); void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_config(struct ieee80211_local *local, u32 changed) { int ret; might_sleep(); trace_drv_config(local, changed); ret = local->ops->config(&local->hw, changed); trace_drv_return_int(local, ret); return ret; } static inline void drv_bss_info_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info, u32 changed) { might_sleep(); if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED) && sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT && sdata->vif.type != NL80211_IFTYPE_OCB)) return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !sdata->vif.mu_mimo_owner && !(changed & BSS_CHANGED_TXPOWER)))) return; if (!check_sdata_in_driver(sdata)) return; trace_drv_bss_info_changed(local, sdata, info, changed); if (local->ops->bss_info_changed) local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed); trace_drv_return_void(local); } static inline u64 drv_prepare_multicast(struct ieee80211_local *local, struct netdev_hw_addr_list *mc_list) { u64 ret = 0; trace_drv_prepare_multicast(local, mc_list->count); if (local->ops->prepare_multicast) ret = local->ops->prepare_multicast(&local->hw, mc_list); trace_drv_return_u64(local, ret); return ret; } static inline void drv_configure_filter(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, u64 multicast) { might_sleep(); trace_drv_configure_filter(local, changed_flags, total_flags, multicast); local->ops->configure_filter(&local->hw, changed_flags, total_flags, multicast); trace_drv_return_void(local); } static inline void drv_config_iface_filter(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int filter_flags, unsigned int changed_flags) { might_sleep(); trace_drv_config_iface_filter(local, sdata, filter_flags, changed_flags); if (local->ops->config_iface_filter) local->ops->config_iface_filter(&local->hw, &sdata->vif, filter_flags, changed_flags); trace_drv_return_void(local); } static inline int drv_set_tim(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set) { int ret = 0; trace_drv_set_tim(local, sta, set); if (local->ops->set_tim) ret = local->ops->set_tim(&local->hw, sta, set); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_key(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key) { int ret; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_set_key(local, cmd, sdata, sta, key); ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); trace_drv_return_int(local, ret); return ret; } static inline void drv_update_tkip_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_key_conf *conf, struct sta_info *sta, u32 iv32, u16 *phase1key) { struct ieee80211_sta *ista = NULL; if (sta) ista = &sta->sta; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_update_tkip_key(local, sdata, conf, ista, iv32); if (local->ops->update_tkip_key) local->ops->update_tkip_key(&local->hw, &sdata->vif, conf, ista, iv32, phase1key); trace_drv_return_void(local); } static inline int drv_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_request *req) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_hw_scan(local, sdata); ret = local->ops->hw_scan(&local->hw, &sdata->vif, req); trace_drv_return_int(local, ret); return ret; } static inline void drv_cancel_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_cancel_hw_scan(local, sdata); local->ops->cancel_hw_scan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_sched_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req, struct ieee80211_scan_ies *ies) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_start(local, sdata); ret = local->ops->sched_scan_start(&local->hw, &sdata->vif, req, ies); trace_drv_return_int(local, ret); return ret; } static inline int drv_sched_scan_stop(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_stop(local, sdata); ret = local->ops->sched_scan_stop(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_sw_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const u8 *mac_addr) { might_sleep(); trace_drv_sw_scan_start(local, sdata, mac_addr); if (local->ops->sw_scan_start) local->ops->sw_scan_start(&local->hw, &sdata->vif, mac_addr); trace_drv_return_void(local); } static inline void drv_sw_scan_complete(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); trace_drv_sw_scan_complete(local, sdata); if (local->ops->sw_scan_complete) local->ops->sw_scan_complete(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_get_stats(struct ieee80211_local *local, struct ieee80211_low_level_stats *stats) { int ret = -EOPNOTSUPP; might_sleep(); if (local->ops->get_stats) ret = local->ops->get_stats(&local->hw, stats); trace_drv_get_stats(local, stats, ret); return ret; } static inline void drv_get_key_seq(struct ieee80211_local *local, struct ieee80211_key *key, struct ieee80211_key_seq *seq) { if (local->ops->get_key_seq) local->ops->get_key_seq(&local->hw, &key->conf, seq); trace_drv_get_key_seq(local, &key->conf); } static inline int drv_set_frag_threshold(struct ieee80211_local *local, u32 value) { int ret = 0; might_sleep(); trace_drv_set_frag_threshold(local, value); if (local->ops->set_frag_threshold) ret = local->ops->set_frag_threshold(&local->hw, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_rts_threshold(struct ieee80211_local *local, u32 value) { int ret = 0; might_sleep(); trace_drv_set_rts_threshold(local, value); if (local->ops->set_rts_threshold) ret = local->ops->set_rts_threshold(&local->hw, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_coverage_class(struct ieee80211_local *local, s16 value) { int ret = 0; might_sleep(); trace_drv_set_coverage_class(local, value); if (local->ops->set_coverage_class) local->ops->set_coverage_class(&local->hw, value); else ret = -EOPNOTSUPP; trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_notify(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum sta_notify_cmd cmd, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_notify(local, sdata, cmd, sta); if (local->ops->sta_notify) local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta); trace_drv_return_void(local); } static inline int drv_sta_add(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { int ret = 0; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_add(local, sdata, sta); if (local->ops->sta_add) ret = local->ops->sta_add(&local->hw, &sdata->vif, sta); trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_remove(local, sdata, sta); if (local->ops->sta_remove) local->ops->sta_remove(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } #ifdef CONFIG_MAC80211_DEBUGFS static inline void drv_sta_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct dentry *dir) { might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->sta_add_debugfs) local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_pre_rcu_remove(local, sdata, &sta->sta); if (local->ops->sta_pre_rcu_remove) local->ops->sta_pre_rcu_remove(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta); void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_rate_tbl_update(local, sdata, sta); if (local->ops->sta_rate_tbl_update) local->ops->sta_rate_tbl_update(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_sta_statistics(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct station_info *sinfo) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_statistics(local, sdata, sta); if (local->ops->sta_statistics) local->ops->sta_statistics(&local->hw, &sdata->vif, sta, sinfo); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 ac, const struct ieee80211_tx_queue_params *params); u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf); void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset); void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_tx_last_beacon(struct ieee80211_local *local) { int ret = 0; /* default unsupported op for less congestion */ might_sleep(); trace_drv_tx_last_beacon(local); if (local->ops->tx_last_beacon) ret = local->ops->tx_last_beacon(&local->hw); trace_drv_return_int(local, ret); return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) { int ret = -EOPNOTSUPP; trace_drv_get_survey(local, idx, survey); if (local->ops->get_survey) ret = local->ops->get_survey(&local->hw, idx, survey); trace_drv_return_int(local, ret); return ret; } static inline void drv_rfkill_poll(struct ieee80211_local *local) { might_sleep(); if (local->ops->rfkill_poll) local->ops->rfkill_poll(&local->hw); } static inline void drv_flush(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u32 queues, bool drop) { struct ieee80211_vif *vif = sdata ? &sdata->vif : NULL; might_sleep(); if (sdata && !check_sdata_in_driver(sdata)) return; trace_drv_flush(local, queues, drop); if (local->ops->flush) local->ops->flush(&local->hw, vif, queues, drop); trace_drv_return_void(local); } static inline void drv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { might_sleep(); trace_drv_channel_switch(local, sdata, ch_switch); local->ops->channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_void(local); } static inline int drv_set_antenna(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); if (local->ops->set_antenna) ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant); trace_drv_set_antenna(local, tx_ant, rx_ant, ret); return ret; } static inline int drv_get_antenna(struct ieee80211_local *local, u32 *tx_ant, u32 *rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); if (local->ops->get_antenna) ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant); trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret); return ret; } static inline int drv_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, unsigned int duration, enum ieee80211_roc_type type) { int ret; might_sleep(); trace_drv_remain_on_channel(local, sdata, chan, duration, type); ret = local->ops->remain_on_channel(&local->hw, &sdata->vif, chan, duration, type); trace_drv_return_int(local, ret); return ret; } static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); trace_drv_cancel_remain_on_channel(local, sdata); ret = local->ops->cancel_remain_on_channel(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_ringparam(struct ieee80211_local *local, u32 tx, u32 rx) { int ret = -ENOTSUPP; might_sleep(); trace_drv_set_ringparam(local, tx, rx); if (local->ops->set_ringparam) ret = local->ops->set_ringparam(&local->hw, tx, rx); trace_drv_return_int(local, ret); return ret; } static inline void drv_get_ringparam(struct ieee80211_local *local, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max) { might_sleep(); trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max); if (local->ops->get_ringparam) local->ops->get_ringparam(&local->hw, tx, tx_max, rx, rx_max); trace_drv_return_void(local); } static inline bool drv_tx_frames_pending(struct ieee80211_local *local) { bool ret = false; might_sleep(); trace_drv_tx_frames_pending(local); if (local->ops->tx_frames_pending) ret = local->ops->tx_frames_pending(&local->hw); trace_drv_return_bool(local, ret); return ret; } static inline int drv_set_bitrate_mask(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_bitrate_mask *mask) { int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_set_bitrate_mask(local, sdata, mask); if (local->ops->set_bitrate_mask) ret = local->ops->set_bitrate_mask(&local->hw, &sdata->vif, mask); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_rekey_data(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_gtk_rekey_data *data) { if (!check_sdata_in_driver(sdata)) return; trace_drv_set_rekey_data(local, sdata, data); if (local->ops->set_rekey_data) local->ops->set_rekey_data(&local->hw, &sdata->vif, data); trace_drv_return_void(local); } static inline void drv_event_callback(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct ieee80211_event *event) { trace_drv_event_callback(local, sdata, event); if (local->ops->event_callback) local->ops->event_callback(&local->hw, &sdata->vif, event); trace_drv_return_void(local); } static inline void drv_release_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_release_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->release_buffered_frames) local->ops->release_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_allow_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_allow_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->allow_buffered_frames) local->ops->allow_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_prepare_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_prepare_tx) local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_complete_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_complete_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_complete_tx) local->ops->mgd_complete_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_protect_tdls_discover(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_protect_tdls_discover(local, sdata); if (local->ops->mgd_protect_tdls_discover) local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_add_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { int ret = -EOPNOTSUPP; might_sleep(); trace_drv_add_chanctx(local, ctx); if (local->ops->add_chanctx) ret = local->ops->add_chanctx(&local->hw, &ctx->conf); trace_drv_return_int(local, ret); if (!ret) ctx->driver_present = true; return ret; } static inline void drv_remove_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { might_sleep(); if (WARN_ON(!ctx->driver_present)) return; trace_drv_remove_chanctx(local, ctx); if (local->ops->remove_chanctx) local->ops->remove_chanctx(&local->hw, &ctx->conf); trace_drv_return_void(local); ctx->driver_present = false; } static inline void drv_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, u32 changed) { might_sleep(); trace_drv_change_chanctx(local, ctx, changed); if (local->ops->change_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->change_chanctx(&local->hw, &ctx->conf, changed); } trace_drv_return_void(local); } static inline int drv_assign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx) { int ret = 0; if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_assign_vif_chanctx(local, sdata, ctx); if (local->ops->assign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); ret = local->ops->assign_vif_chanctx(&local->hw, &sdata->vif, &ctx->conf); } trace_drv_return_int(local, ret); return ret; } static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_unassign_vif_chanctx(local, sdata, ctx); if (local->ops->unassign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->unassign_vif_chanctx(&local->hw, &sdata->vif, &ctx->conf); } trace_drv_return_void(local); } int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); static inline int drv_start_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_ap(local, sdata, &sdata->vif.bss_conf); if (local->ops->start_ap) ret = local->ops->start_ap(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { if (!check_sdata_in_driver(sdata)) return; trace_drv_stop_ap(local, sdata); if (local->ops->stop_ap) local->ops->stop_ap(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline void drv_reconfig_complete(struct ieee80211_local *local, enum ieee80211_reconfig_type reconfig_type) { might_sleep(); trace_drv_reconfig_complete(local, reconfig_type); if (local->ops->reconfig_complete) local->ops->reconfig_complete(&local->hw, reconfig_type); trace_drv_return_void(local); } static inline void drv_set_default_unicast_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int key_idx) { if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(key_idx < -1 || key_idx > 3); trace_drv_set_default_unicast_key(local, sdata, key_idx); if (local->ops->set_default_unicast_key) local->ops->set_default_unicast_key(&local->hw, &sdata->vif, key_idx); trace_drv_return_void(local); } #if IS_ENABLED(CONFIG_IPV6) static inline void drv_ipv6_addr_change(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct inet6_dev *idev) { trace_drv_ipv6_addr_change(local, sdata); if (local->ops->ipv6_addr_change) local->ops->ipv6_addr_change(&local->hw, &sdata->vif, idev); trace_drv_return_void(local); } #endif static inline void drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = sdata->local; if (local->ops->channel_switch_beacon) { trace_drv_channel_switch_beacon(local, sdata, chandef); local->ops->channel_switch_beacon(&local->hw, &sdata->vif, chandef); } } static inline int drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; int ret = 0; if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_pre_channel_switch(local, sdata, ch_switch); if (local->ops->pre_channel_switch) ret = local->ops->pre_channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_int(local, ret); return ret; } static inline int drv_post_channel_switch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; int ret = 0; if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_post_channel_switch(local, sdata); if (local->ops->post_channel_switch) ret = local->ops->post_channel_switch(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; if (!check_sdata_in_driver(sdata)) return; trace_drv_abort_channel_switch(local, sdata); if (local->ops->abort_channel_switch) local->ops->abort_channel_switch(&local->hw, &sdata->vif); } static inline void drv_channel_switch_rx_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; if (!check_sdata_in_driver(sdata)) return; trace_drv_channel_switch_rx_beacon(local, sdata, ch_switch); if (local->ops->channel_switch_rx_beacon) local->ops->channel_switch_rx_beacon(&local->hw, &sdata->vif, ch_switch); } static inline int drv_join_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf); if (local->ops->join_ibss) ret = local->ops->join_ibss(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_leave_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_leave_ibss(local, sdata); if (local->ops->leave_ibss) local->ops->leave_ibss(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, struct sta_info *sta) { u32 ret = 0; trace_drv_get_expected_throughput(&sta->sta); if (local->ops->get_expected_throughput && sta->uploaded) ret = local->ops->get_expected_throughput(&local->hw, &sta->sta); trace_drv_return_u32(local, ret); return ret; } static inline int drv_get_txpower(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int *dbm) { int ret; if (!local->ops->get_txpower) return -EOPNOTSUPP; ret = local->ops->get_txpower(&local->hw, &sdata->vif, dbm); trace_drv_get_txpower(local, sdata, *dbm, ret); return ret; } static inline int drv_tdls_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef, struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; if (!local->ops->tdls_channel_switch) return -EOPNOTSUPP; trace_drv_tdls_channel_switch(local, sdata, sta, oper_class, chandef); ret = local->ops->tdls_channel_switch(&local->hw, &sdata->vif, sta, oper_class, chandef, tmpl_skb, ch_sw_tm_ie); trace_drv_return_int(local, ret); return ret; } static inline void drv_tdls_cancel_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->tdls_cancel_channel_switch) return; trace_drv_tdls_cancel_channel_switch(local, sdata, sta); local->ops->tdls_cancel_channel_switch(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_tdls_recv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_tdls_ch_sw_params *params) { trace_drv_tdls_recv_channel_switch(local, sdata, params); if (local->ops->tdls_recv_channel_switch) local->ops->tdls_recv_channel_switch(&local->hw, &sdata->vif, params); trace_drv_return_void(local); } static inline void drv_wake_tx_queue(struct ieee80211_local *local, struct txq_info *txq) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); /* In reconfig don't transmit now, but mark for waking later */ if (local->in_reconfig) { set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txq->flags); return; } if (!check_sdata_in_driver(sdata)) return; trace_drv_wake_tx_queue(local, sdata, txq); local->ops->wake_tx_queue(&local->hw, &txq->txq); } static inline void schedule_and_wake_txq(struct ieee80211_local *local, struct txq_info *txqi) { ieee80211_schedule_txq(&local->hw, &txqi->txq); drv_wake_tx_queue(local, txqi); } static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local, struct sk_buff *head, struct sk_buff *skb) { if (!local->ops->can_aggregate_in_amsdu) return true; return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb); } static inline int drv_get_ftm_responder_stats(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_ftm_responder_stats *ftm_stats) { u32 ret = -EOPNOTSUPP; if (local->ops->get_ftm_responder_stats) ret = local->ops->get_ftm_responder_stats(&local->hw, &sdata->vif, ftm_stats); trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats); return ret; } static inline int drv_start_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_pmsr(local, sdata); if (local->ops->start_pmsr) ret = local->ops->start_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { trace_drv_abort_pmsr(local, sdata); might_sleep(); if (!check_sdata_in_driver(sdata)) return; if (local->ops->abort_pmsr) local->ops->abort_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_void(local); } static inline int drv_start_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf) { int ret; might_sleep(); check_sdata_in_driver(sdata); trace_drv_start_nan(local, sdata, conf); ret = local->ops->start_nan(&local->hw, &sdata->vif, conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); check_sdata_in_driver(sdata); trace_drv_stop_nan(local, sdata); local->ops->stop_nan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_nan_change_conf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf, u32 changes) { int ret; might_sleep(); check_sdata_in_driver(sdata); if (!local->ops->nan_change_conf) return -EOPNOTSUPP; trace_drv_nan_change_conf(local, sdata, conf, changes); ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf, changes); trace_drv_return_int(local, ret); return ret; } static inline int drv_add_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_nan_func *nan_func) { int ret; might_sleep(); check_sdata_in_driver(sdata); if (!local->ops->add_nan_func) return -EOPNOTSUPP; trace_drv_add_nan_func(local, sdata, nan_func); ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func); trace_drv_return_int(local, ret); return ret; } static inline void drv_del_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u8 instance_id) { might_sleep(); check_sdata_in_driver(sdata); trace_drv_del_nan_func(local, sdata, instance_id); if (local->ops->del_nan_func) local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id); trace_drv_return_void(local); } static inline int drv_set_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct cfg80211_tid_config *tid_conf) { int ret; might_sleep(); ret = local->ops->set_tid_config(&local->hw, &sdata->vif, sta, tid_conf); trace_drv_return_int(local, ret); return ret; } static inline int drv_reset_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 tids) { int ret; might_sleep(); ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids); trace_drv_return_int(local, ret); return ret; } static inline void drv_update_vif_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); check_sdata_in_driver(sdata); if (!local->ops->update_vif_offload) return; trace_drv_update_vif_offload(local, sdata); local->ops->update_vif_offload(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline void drv_sta_set_4addr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_4addr(local, sdata, sta, enabled); if (local->ops->sta_set_4addr) local->ops->sta_set_4addr(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_decap_offload(local, sdata, sta, enabled); if (local->ops->sta_set_decap_offload) local->ops->sta_set_decap_offload(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_add_twt_setup(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt) { struct ieee80211_twt_params *twt_agrt; might_sleep(); if (!check_sdata_in_driver(sdata)) return; twt_agrt = (void *)twt->params; trace_drv_add_twt_setup(local, sta, twt, twt_agrt); local->ops->add_twt_setup(&local->hw, sta, twt); trace_drv_return_void(local); } static inline void drv_twt_teardown_request(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 flowid) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->twt_teardown_request) return; trace_drv_twt_teardown_request(local, sta, flowid); local->ops->twt_teardown_request(&local->hw, sta, flowid); trace_drv_return_void(local); } #endif /* __MAC80211_DRIVER_OPS */
69 4 4 1220 26 22 22 4 4 646 643 646 645 69 69 4 4 352 4 348 905 907 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> #include <linux/reboot.h> /* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations * and the like. */ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. */ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { WARN(1, "double register detected"); return 0; } if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); return 0; } static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); return 0; } nl = &((*nl)->next); } return -ENOENT; } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nl: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: Number of notifier functions to be called. Don't care * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. * @returns: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; } #endif ret = nb->notifier_call(nb, val, v); if (nr_calls) (*nr_calls)++; if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; } return ret; } NOKPROBE_SYMBOL(notifier_call_chain); /** * notifier_call_chain_robust - Inform the registered notifiers about an event * and rollback on error. * @nl: Pointer to head of the blocking notifier chain * @val_up: Value passed unmodified to the notifier function * @val_down: Value passed unmodified to the notifier function when recovering * from an error on @val_up * @v Pointer passed unmodified to the notifier function * * NOTE: It is important the @nl chain doesn't change between the two * invocations of notifier_call_chain() such that we visit the * exact same notifier callbacks; this rules out any RCU usage. * * Returns: the return value of the @val_up call. */ static int notifier_call_chain_robust(struct notifier_block **nl, unsigned long val_up, unsigned long val_down, void *v) { int ret, nr = 0; ret = notifier_call_chain(nl, val_up, v, -1, &nr); if (ret & NOTIFY_STOP_MASK) notifier_call_chain(nl, val_down, v, nr-1, NULL); return ret; } /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** * atomic_notifier_chain_register - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain. * * Currently always returns zero. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an atomic notifier chain. * * Returns zero on success or %-ENOENT on failure. */ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; rcu_read_lock(); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ /** * blocking_notifier_chain_register - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain. * Must be called in process context. * * Currently always returns zero. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_register(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a blocking notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_unregister(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); /** * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* * Raw notifier chain routines. There is no protection; * the caller must provide it. Use at your own risk! */ /** * raw_notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * * Currently always returns zero. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_register(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); /** * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. * All locking must be provided by the caller. * * Returns zero on success or %-ENOENT on failure. */ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_unregister(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { return notifier_call_chain_robust(&nh->head, val_up, val_down, v); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); /** * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); #ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). */ /** * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: New entry in notifier chain * * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * * Currently always returns zero. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_register(&nh->head, n); mutex_unlock(&nh->mutex); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); /** * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an SRCU notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_unregister(&nh->head, n); mutex_unlock(&nh->mutex); synchronize_srcu(&nh->srcu); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** * srcu_init_notifier_head - Initialize an SRCU notifier head * @nh: Pointer to head of the srcu notifier chain * * Unlike other sorts of notifier heads, SRCU notifier heads require * dynamic initialization. Be sure to call this routine before * calling any of the other SRCU notifier routines for this head. * * If an SRCU notifier head is deallocated, it must first be cleaned * up by calling srcu_cleanup_notifier_head(). Otherwise the head's * per-cpu data (used by the SRCU mechanism) will leak. */ void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); if (init_srcu_struct(&nh->srcu) < 0) BUG(); nh->head = NULL; } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); #endif /* CONFIG_SRCU */ static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { .regs = regs, .str = str, .err = err, .trapnr = trap, .signr = sig, }; RCU_LOCKDEP_WARN(!rcu_is_watching(), "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier);
16 20 4 4 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ #include "netlink.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/genetlink.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" #include "log.h" #include "multicast.h" #include "network-coding.h" #include "originator.h" #include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" struct genl_family batadv_netlink_family; /* multicast groups */ enum batadv_netlink_multicast_groups { BATADV_NL_MCGRP_CONFIG, BATADV_NL_MCGRP_TPMETER, }; /** * enum batadv_genl_ops_flags - flags for genl_ops's internal_flags */ enum batadv_genl_ops_flags { /** * @BATADV_FLAG_NEED_MESH: request requires valid soft interface in * attribute BATADV_ATTR_MESH_IFINDEX and expects a pointer to it to be * saved in info->user_ptr[0] */ BATADV_FLAG_NEED_MESH = BIT(0), /** * @BATADV_FLAG_NEED_HARDIF: request requires valid hard interface in * attribute BATADV_ATTR_HARD_IFINDEX and expects a pointer to it to be * saved in info->user_ptr[1] */ BATADV_FLAG_NEED_HARDIF = BIT(1), /** * @BATADV_FLAG_NEED_VLAN: request requires valid vlan in * attribute BATADV_ATTR_VLANID and expects a pointer to it to be * saved in info->user_ptr[1] */ BATADV_FLAG_NEED_VLAN = BIT(2), }; static const struct genl_multicast_group batadv_netlink_mcgrps[] = { [BATADV_NL_MCGRP_CONFIG] = { .name = BATADV_NL_MCAST_GROUP_CONFIG }, [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, }; static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TQ] = { .type = NLA_U8 }, [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, [BATADV_ATTR_VLANID] = { .type = NLA_U16 }, [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_AP_ISOLATION_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 }, [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 }, [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_FRAGMENTATION_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_GW_BANDWIDTH_DOWN] = { .type = NLA_U32 }, [BATADV_ATTR_GW_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 }, [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 }, [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 }, }; /** * batadv_netlink_get_ifindex() - Extract an interface index from a message * @nlh: Message header * @attrtype: Attribute which holds an interface index * * Return: interface index, or 0. */ int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) { struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0; } /** * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation softif attribute * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg, struct batadv_priv *bat_priv) { struct batadv_softif_vlan *vlan; u8 ap_isolation; vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return 0; ap_isolation = atomic_read(&vlan->ap_isolation); batadv_softif_vlan_put(vlan); return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, !!ap_isolation); } /** * batadv_netlink_set_mesh_ap_isolation() - Set ap_isolation from genl msg * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr, struct batadv_priv *bat_priv) { struct batadv_softif_vlan *vlan; vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return -ENOENT; atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); batadv_softif_vlan_put(vlan); return 0; } /** * batadv_netlink_mesh_fill() - Fill message with mesh attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_mesh_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { struct net_device *soft_iface = bat_priv->soft_iface; struct batadv_hard_iface *primary_if = NULL; struct net_device *hard_iface; void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_priv->algo_ops->name) || nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, soft_iface->dev_addr) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, (u8)atomic_read(&bat_priv->tt.vn))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BLA if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC, ntohs(bat_priv->bla.claim_dest.group))) goto nla_put_failure; #endif if (batadv_mcast_mesh_info_put(msg, bat_priv)) goto nla_put_failure; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { hard_iface = primary_if->net_dev; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hard_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hard_iface->name) || nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, hard_iface->dev_addr)) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_AGGREGATED_OGMS_ENABLED, !!atomic_read(&bat_priv->aggregated_ogms))) goto nla_put_failure; if (batadv_netlink_mesh_fill_ap_isolation(msg, bat_priv)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MARK, bat_priv->isolation_mark)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MASK, bat_priv->isolation_mark_mask)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_BONDING_ENABLED, !!atomic_read(&bat_priv->bonding))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BLA if (nla_put_u8(msg, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, !!atomic_read(&bat_priv->bridge_loop_avoidance))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BLA */ #ifdef CONFIG_BATMAN_ADV_DAT if (nla_put_u8(msg, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, !!atomic_read(&bat_priv->distributed_arp_table))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DAT */ if (nla_put_u8(msg, BATADV_ATTR_FRAGMENTATION_ENABLED, !!atomic_read(&bat_priv->fragmentation))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_DOWN, atomic_read(&bat_priv->gw.bandwidth_down))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_UP, atomic_read(&bat_priv->gw.bandwidth_up))) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_GW_MODE, atomic_read(&bat_priv->gw.mode))) goto nla_put_failure; if (bat_priv->algo_ops->gw.get_best_gw_node && bat_priv->algo_ops->gw.is_eligible) { /* GW selection class is not available if the routing algorithm * in use does not implement the GW API */ if (nla_put_u32(msg, BATADV_ATTR_GW_SEL_CLASS, atomic_read(&bat_priv->gw.sel_class))) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, atomic_read(&bat_priv->hop_penalty))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_DEBUG if (nla_put_u32(msg, BATADV_ATTR_LOG_LEVEL, atomic_read(&bat_priv->log_level))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DEBUG */ #ifdef CONFIG_BATMAN_ADV_MCAST if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, !atomic_read(&bat_priv->multicast_mode))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT, atomic_read(&bat_priv->multicast_fanout))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED, !!atomic_read(&bat_priv->network_coding))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_NC */ if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; batadv_hardif_put(primary_if); genlmsg_end(msg, hdr); return 0; nla_put_failure: batadv_hardif_put(primary_if); genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_mesh() - send softif attributes to listener * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success, < 0 on error */ int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_SET_MESH, 0, 0, 0); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_get_mesh() - Get softif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_GET_MESH, info->snd_portid, info->snd_seq, 0); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_mesh() - Set softif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]) { attr = info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]; atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr)); } if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; batadv_netlink_set_mesh_ap_isolation(attr, bat_priv); } if (info->attrs[BATADV_ATTR_ISOLATION_MARK]) { attr = info->attrs[BATADV_ATTR_ISOLATION_MARK]; bat_priv->isolation_mark = nla_get_u32(attr); } if (info->attrs[BATADV_ATTR_ISOLATION_MASK]) { attr = info->attrs[BATADV_ATTR_ISOLATION_MASK]; bat_priv->isolation_mark_mask = nla_get_u32(attr); } if (info->attrs[BATADV_ATTR_BONDING_ENABLED]) { attr = info->attrs[BATADV_ATTR_BONDING_ENABLED]; atomic_set(&bat_priv->bonding, !!nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_BLA if (info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]) { attr = info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]; atomic_set(&bat_priv->bridge_loop_avoidance, !!nla_get_u8(attr)); batadv_bla_status_update(bat_priv->soft_iface); } #endif /* CONFIG_BATMAN_ADV_BLA */ #ifdef CONFIG_BATMAN_ADV_DAT if (info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]) { attr = info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]; atomic_set(&bat_priv->distributed_arp_table, !!nla_get_u8(attr)); batadv_dat_status_update(bat_priv->soft_iface); } #endif /* CONFIG_BATMAN_ADV_DAT */ if (info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]; atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr)); rtnl_lock(); batadv_update_min_mtu(bat_priv->soft_iface); rtnl_unlock(); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) { attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]; atomic_set(&bat_priv->gw.bandwidth_down, nla_get_u32(attr)); batadv_gw_tvlv_container_update(bat_priv); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]) { attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]; atomic_set(&bat_priv->gw.bandwidth_up, nla_get_u32(attr)); batadv_gw_tvlv_container_update(bat_priv); } if (info->attrs[BATADV_ATTR_GW_MODE]) { u8 gw_mode; attr = info->attrs[BATADV_ATTR_GW_MODE]; gw_mode = nla_get_u8(attr); if (gw_mode <= BATADV_GW_MODE_SERVER) { /* Invoking batadv_gw_reselect() is not enough to really * de-select the current GW. It will only instruct the * gateway client code to perform a re-election the next * time that this is needed. * * When gw client mode is being switched off the current * GW must be de-selected explicitly otherwise no GW_ADD * uevent is thrown on client mode re-activation. This * is operation is performed in * batadv_gw_check_client_stop(). */ batadv_gw_reselect(bat_priv); /* always call batadv_gw_check_client_stop() before * changing the gateway state */ batadv_gw_check_client_stop(bat_priv); atomic_set(&bat_priv->gw.mode, gw_mode); batadv_gw_tvlv_container_update(bat_priv); } } if (info->attrs[BATADV_ATTR_GW_SEL_CLASS] && bat_priv->algo_ops->gw.get_best_gw_node && bat_priv->algo_ops->gw.is_eligible) { /* setting the GW selection class is allowed only if the routing * algorithm in use implements the GW API */ u32 sel_class_max = 0xffffffffu; u32 sel_class; attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS]; sel_class = nla_get_u32(attr); if (!bat_priv->algo_ops->gw.store_sel_class) sel_class_max = BATADV_TQ_MAX_VALUE; if (sel_class >= 1 && sel_class <= sel_class_max) { atomic_set(&bat_priv->gw.sel_class, sel_class); batadv_gw_reselect(bat_priv); } } if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_DEBUG if (info->attrs[BATADV_ATTR_LOG_LEVEL]) { attr = info->attrs[BATADV_ATTR_LOG_LEVEL]; atomic_set(&bat_priv->log_level, nla_get_u32(attr) & BATADV_DBG_ALL); } #endif /* CONFIG_BATMAN_ADV_DEBUG */ #ifdef CONFIG_BATMAN_ADV_MCAST if (info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]) { attr = info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]; atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr)); } if (info->attrs[BATADV_ATTR_MULTICAST_FANOUT]) { attr = info->attrs[BATADV_ATTR_MULTICAST_FANOUT]; atomic_set(&bat_priv->multicast_fanout, nla_get_u32(attr)); } #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) { attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]; atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr)); batadv_nc_status_update(bat_priv->soft_iface); } #endif /* CONFIG_BATMAN_ADV_NC */ if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) { u32 orig_interval; attr = info->attrs[BATADV_ATTR_ORIG_INTERVAL]; orig_interval = nla_get_u32(attr); orig_interval = min_t(u32, orig_interval, INT_MAX); orig_interval = max_t(u32, orig_interval, 2 * BATADV_JITTER); atomic_set(&bat_priv->orig_interval, orig_interval); } batadv_netlink_notify_mesh(bat_priv); return 0; } /** * batadv_netlink_tp_meter_put() - Fill information of started tp_meter session * @msg: netlink message to be sent back * @cookie: tp meter session cookie * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) { if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) return -ENOBUFS; return 0; } /** * batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * * Return: 0 on success, < 0 on error */ int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, u32 cookie) { struct sk_buff *msg; void *hdr; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &batadv_netlink_family, 0, BATADV_CMD_TP_METER); if (!hdr) { ret = -ENOBUFS; goto err_genlmsg; } if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_TPMETER_TEST_TIME, test_time)) goto nla_put_failure; if (nla_put_u64_64bit(msg, BATADV_ATTR_TPMETER_BYTES, total_bytes, BATADV_ATTR_PAD)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_TPMETER_RESULT, result)) goto nla_put_failure; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, dst)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_TPMETER, GFP_KERNEL); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); ret = -EMSGSIZE; err_genlmsg: nlmsg_free(msg); return ret; } /** * batadv_netlink_tp_meter_start() - Start a new tp_meter session * @skb: received netlink message * @info: receiver information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg = NULL; u32 test_length; void *msg_head; u32 cookie; u8 *dst; int ret; if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) return -EINVAL; if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]) return -EINVAL; dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto out; } msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, &batadv_netlink_family, 0, BATADV_CMD_TP_METER); if (!msg_head) { ret = -ENOBUFS; goto out; } batadv_tp_start(bat_priv, dst, test_length, &cookie); ret = batadv_netlink_tp_meter_put(msg, cookie); out: if (ret) { if (msg) nlmsg_free(msg); return ret; } genlmsg_end(msg, msg_head); return genlmsg_reply(msg, info); } /** * batadv_netlink_tp_meter_cancel() - Cancel a running tp_meter session * @skb: received netlink message * @info: receiver information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; u8 *dst; int ret = 0; if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) return -EINVAL; dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL); return ret; } /** * batadv_netlink_hardif_fill() - Fill message with hardif attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * @hard_iface: hard interface which was modified * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * @cb: Control block containing additional options * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_hardif_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags, struct netlink_callback *cb) { struct net_device *net_dev = hard_iface->net_dev; void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, bat_priv->soft_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, bat_priv->soft_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, net_dev->name) || nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, net_dev->dev_addr)) goto nla_put_failure; if (hard_iface->if_status == BATADV_IF_ACTIVE) { if (nla_put_flag(msg, BATADV_ATTR_ACTIVE)) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, atomic_read(&hard_iface->hop_penalty))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT_OVERRIDE, atomic_read(&hard_iface->bat_v.throughput_override))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_hardif() - send hardif attributes to listener * @bat_priv: the bat priv with all the soft interface information * @hard_iface: hard interface which was modified * * Return: 0 on success, < 0 on error */ int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_SET_HARDIF, 0, 0, 0, NULL); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_get_hardif() - Get hardif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_get_hardif(struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_GET_HARDIF, info->snd_portid, info->snd_seq, 0, NULL); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_hardif() - Set hardif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_hardif(struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr)); } if (info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]) { attr = info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]; atomic_set(&hard_iface->bat_v.throughput_override, nla_get_u32(attr)); } #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ batadv_netlink_notify_hardif(bat_priv, hard_iface); return 0; } /** * batadv_netlink_dump_hardif() - Dump all hard interface into a messages * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: error code, or length of reply message on success */ static int batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv; int ifindex; int portid = NETLINK_CB(cb->skb).portid; int skip = cb->args[0]; int i = 0; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface) return -ENODEV; if (!batadv_softif_is_valid(soft_iface)) { dev_put(soft_iface); return -ENODEV; } bat_priv = netdev_priv(soft_iface); rtnl_lock(); cb->seq = batadv_hardif_generation << 1 | 1; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (i++ < skip) continue; if (batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_GET_HARDIF, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb)) { i--; break; } } rtnl_unlock(); dev_put(soft_iface); cb->args[0] = i; return msg->len; } /** * batadv_netlink_vlan_fill() - Fill message with vlan attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the soft interface information * @vlan: vlan which was modified * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_vlan_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, bat_priv->soft_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, bat_priv->soft_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, !!atomic_read(&vlan->ap_isolation))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_vlan() - send vlan attributes to listener * @bat_priv: the bat priv with all the soft interface information * @vlan: vlan which was modified * * Return: 0 on success, < 0 on error */ int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_SET_VLAN, 0, 0, 0); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->soft_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_get_vlan() - Get vlan attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info) { struct batadv_softif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_GET_VLAN, info->snd_portid, info->snd_seq, 0); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_vlan() - Get vlan attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info) { struct batadv_softif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); } batadv_netlink_notify_vlan(bat_priv, vlan); return 0; } /** * batadv_get_softif_from_info() - Retrieve soft interface from genl attributes * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to soft interface (with increased refcnt) on success, error * pointer on error */ static struct net_device * batadv_get_softif_from_info(struct net *net, struct genl_info *info) { struct net_device *soft_iface; int ifindex; if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) return ERR_PTR(-EINVAL); ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface) return ERR_PTR(-ENODEV); if (!batadv_softif_is_valid(soft_iface)) goto err_put_softif; return soft_iface; err_put_softif: dev_put(soft_iface); return ERR_PTR(-EINVAL); } /** * batadv_get_hardif_from_info() - Retrieve hardif from genl attributes * @bat_priv: the bat priv with all the soft interface information * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ static struct batadv_hard_iface * batadv_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net, struct genl_info *info) { struct batadv_hard_iface *hard_iface; struct net_device *hard_dev; unsigned int hardif_index; if (!info->attrs[BATADV_ATTR_HARD_IFINDEX]) return ERR_PTR(-EINVAL); hardif_index = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]); hard_dev = dev_get_by_index(net, hardif_index); if (!hard_dev) return ERR_PTR(-ENODEV); hard_iface = batadv_hardif_get_by_netdev(hard_dev); if (!hard_iface) goto err_put_harddev; if (hard_iface->soft_iface != bat_priv->soft_iface) goto err_put_hardif; /* hard_dev is referenced by hard_iface and not needed here */ dev_put(hard_dev); return hard_iface; err_put_hardif: batadv_hardif_put(hard_iface); err_put_harddev: dev_put(hard_dev); return ERR_PTR(-EINVAL); } /** * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes * @bat_priv: the bat priv with all the soft interface information * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to vlan on success (with increased refcnt), error pointer * on error */ static struct batadv_softif_vlan * batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, struct genl_info *info) { struct batadv_softif_vlan *vlan; u16 vid; if (!info->attrs[BATADV_ATTR_VLANID]) return ERR_PTR(-EINVAL); vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]); vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return ERR_PTR(-ENOENT); return vlan; } /** * batadv_pre_doit() - Prepare batman-adv genl doit request * @ops: requested netlink operation * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv = NULL; struct batadv_softif_vlan *vlan; struct net_device *soft_iface; u8 user_ptr1_flags; u8 mesh_dep_flags; int ret; user_ptr1_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1)) return -EINVAL; mesh_dep_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON((ops->internal_flags & mesh_dep_flags) && (~ops->internal_flags & BATADV_FLAG_NEED_MESH))) return -EINVAL; if (ops->internal_flags & BATADV_FLAG_NEED_MESH) { soft_iface = batadv_get_softif_from_info(net, info); if (IS_ERR(soft_iface)) return PTR_ERR(soft_iface); bat_priv = netdev_priv(soft_iface); info->user_ptr[0] = bat_priv; } if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) { hard_iface = batadv_get_hardif_from_info(bat_priv, net, info); if (IS_ERR(hard_iface)) { ret = PTR_ERR(hard_iface); goto err_put_softif; } info->user_ptr[1] = hard_iface; } if (ops->internal_flags & BATADV_FLAG_NEED_VLAN) { vlan = batadv_get_vlan_from_info(bat_priv, net, info); if (IS_ERR(vlan)) { ret = PTR_ERR(vlan); goto err_put_softif; } info->user_ptr[1] = vlan; } return 0; err_put_softif: if (bat_priv) dev_put(bat_priv->soft_iface); return ret; } /** * batadv_post_doit() - End batman-adv genl doit request * @ops: requested netlink operation * @skb: Netlink message with request data * @info: receiver information */ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; struct batadv_priv *bat_priv; if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF && info->user_ptr[1]) { hard_iface = info->user_ptr[1]; batadv_hardif_put(hard_iface); } if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) { vlan = info->user_ptr[1]; batadv_softif_vlan_put(vlan); } if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) { bat_priv = info->user_ptr[0]; dev_put(bat_priv->soft_iface); } } static const struct genl_small_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = batadv_netlink_get_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_tp_meter_start, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER_CANCEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_tp_meter_cancel, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_GET_ROUTING_ALGOS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_algo_dump, }, { .cmd = BATADV_CMD_GET_HARDIF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .dumpit = batadv_netlink_dump_hardif, .doit = batadv_netlink_get_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_tt_local_dump, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_tt_global_dump, }, { .cmd = BATADV_CMD_GET_ORIGINATORS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_orig_dump, }, { .cmd = BATADV_CMD_GET_NEIGHBORS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_hardif_neigh_dump, }, { .cmd = BATADV_CMD_GET_GATEWAYS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_gw_dump, }, { .cmd = BATADV_CMD_GET_BLA_CLAIM, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_bla_claim_dump, }, { .cmd = BATADV_CMD_GET_BLA_BACKBONE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_bla_backbone_dump, }, { .cmd = BATADV_CMD_GET_DAT_CACHE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_dat_cache_dump, }, { .cmd = BATADV_CMD_GET_MCAST_FLAGS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_mcast_flags_dump, }, { .cmd = BATADV_CMD_SET_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_SET_HARDIF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_VLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = batadv_netlink_get_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, }, { .cmd = BATADV_CMD_SET_VLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, }, }; struct genl_family batadv_netlink_family __ro_after_init = { .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, .maxattr = BATADV_ATTR_MAX, .policy = batadv_netlink_policy, .netnsok = true, .pre_doit = batadv_pre_doit, .post_doit = batadv_post_doit, .module = THIS_MODULE, .small_ops = batadv_netlink_ops, .n_small_ops = ARRAY_SIZE(batadv_netlink_ops), .mcgrps = batadv_netlink_mcgrps, .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps), }; /** * batadv_netlink_register() - register batadv genl netlink family */ void __init batadv_netlink_register(void) { int ret; ret = genl_register_family(&batadv_netlink_family); if (ret) pr_warn("unable to register netlink family"); } /** * batadv_netlink_unregister() - unregister batadv genl netlink family */ void batadv_netlink_unregister(void) { genl_unregister_family(&batadv_netlink_family); }
6 5 1 2 1 1 1 1 3 3 3 3 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/rhashtable.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/seg6.h> #include <net/genetlink.h> #include <linux/seg6.h> #include <linux/seg6_genl.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) { unsigned int tlv_offset; int max_last_entry; int trailing; if (srh->type != IPV6_SRCRT_TYPE_4) return false; if (((srh->hdrlen + 1) << 3) != len) return false; if (!reduced && srh->segments_left > srh->first_segment) { return false; } else { max_last_entry = (srh->hdrlen / 2) - 1; if (srh->first_segment > max_last_entry) return false; if (srh->segments_left > srh->first_segment + 1) return false; } tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); trailing = len - tlv_offset; if (trailing < 0) return false; while (trailing) { struct sr6_tlv *tlv; unsigned int tlv_len; if (trailing < sizeof(*tlv)) return false; tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset); tlv_len = sizeof(*tlv) + tlv->len; trailing -= tlv_len; if (trailing < 0) return false; tlv_offset += tlv_len; } return true; } struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags) { struct ipv6_sr_hdr *srh; int len, srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0) return NULL; if (!pskb_may_pull(skb, srhoff + sizeof(*srh))) return NULL; srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); len = (srh->hdrlen + 1) << 3; if (!pskb_may_pull(skb, srhoff + len)) return NULL; /* note that pskb_may_pull may change pointers in header; * for this reason it is necessary to reload them when needed. */ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); if (!seg6_validate_srh(srh, len, true)) return NULL; return srh; } /* Determine if an ICMP invoking packet contains a segment routing * header. If it does, extract the offset to the true destination * address, which is in the first segment address. */ void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt) { __u16 network_header = skb->network_header; struct ipv6_sr_hdr *srh; /* Update network header to point to the invoking packet * inside the ICMP packet, so we can use the seg6_get_srh() * helper. */ skb_reset_network_header(skb); srh = seg6_get_srh(skb, 0); if (!srh) goto out; if (srh->type != IPV6_SRCRT_TYPE_4) goto out; opt->flags |= IP6SKB_SEG6; opt->srhoff = (unsigned char *)srh - skb->data; out: /* Restore the network header back to the ICMP packet */ skb->network_header = network_header; } static struct genl_family seg6_genl_family; static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = { [SEG6_ATTR_DST] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, }, [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, }, [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, }, [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, }, [SEG6_ATTR_ALGID] = { .type = NLA_U8, }, [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, }, }; #ifdef CONFIG_IPV6_SEG6_HMAC static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct seg6_pernet_data *sdata; struct seg6_hmac_info *hinfo; u32 hmackeyid; char *secret; int err = 0; u8 algid; u8 slen; sdata = seg6_pernet(net); if (!info->attrs[SEG6_ATTR_HMACKEYID] || !info->attrs[SEG6_ATTR_SECRETLEN] || !info->attrs[SEG6_ATTR_ALGID]) return -EINVAL; hmackeyid = nla_get_u32(info->attrs[SEG6_ATTR_HMACKEYID]); slen = nla_get_u8(info->attrs[SEG6_ATTR_SECRETLEN]); algid = nla_get_u8(info->attrs[SEG6_ATTR_ALGID]); if (hmackeyid == 0) return -EINVAL; if (slen > SEG6_HMAC_SECRET_LEN) return -EINVAL; mutex_lock(&sdata->lock); hinfo = seg6_hmac_info_lookup(net, hmackeyid); if (!slen) { err = seg6_hmac_info_del(net, hmackeyid); goto out_unlock; } if (!info->attrs[SEG6_ATTR_SECRET]) { err = -EINVAL; goto out_unlock; } if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { err = -EINVAL; goto out_unlock; } if (hinfo) { err = seg6_hmac_info_del(net, hmackeyid); if (err) goto out_unlock; } secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]); hinfo = kzalloc(sizeof(*hinfo), GFP_KERNEL); if (!hinfo) { err = -ENOMEM; goto out_unlock; } memcpy(hinfo->secret, secret, slen); hinfo->slen = slen; hinfo->alg_id = algid; hinfo->hmackeyid = hmackeyid; err = seg6_hmac_info_add(net, hmackeyid, hinfo); if (err) kfree(hinfo); out_unlock: mutex_unlock(&sdata->lock); return err; } #else static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) { return -ENOTSUPP; } #endif static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct in6_addr *val, *t_old, *t_new; struct seg6_pernet_data *sdata; sdata = seg6_pernet(net); if (!info->attrs[SEG6_ATTR_DST]) return -EINVAL; val = nla_data(info->attrs[SEG6_ATTR_DST]); t_new = kmemdup(val, sizeof(*val), GFP_KERNEL); if (!t_new) return -ENOMEM; mutex_lock(&sdata->lock); t_old = sdata->tun_src; rcu_assign_pointer(sdata->tun_src, t_new); mutex_unlock(&sdata->lock); synchronize_net(); kfree(t_old); return 0; } static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct in6_addr *tun_src; struct sk_buff *msg; void *hdr; msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC); if (!hdr) goto free_msg; rcu_read_lock(); tun_src = rcu_dereference(seg6_pernet(net)->tun_src); if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src)) goto nla_put_failure; rcu_read_unlock(); genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); nla_put_failure: rcu_read_unlock(); free_msg: nlmsg_free(msg); return -ENOMEM; } #ifdef CONFIG_IPV6_SEG6_HMAC static int __seg6_hmac_fill_info(struct seg6_hmac_info *hinfo, struct sk_buff *msg) { if (nla_put_u32(msg, SEG6_ATTR_HMACKEYID, hinfo->hmackeyid) || nla_put_u8(msg, SEG6_ATTR_SECRETLEN, hinfo->slen) || nla_put(msg, SEG6_ATTR_SECRET, hinfo->slen, hinfo->secret) || nla_put_u8(msg, SEG6_ATTR_ALGID, hinfo->alg_id)) return -1; return 0; } static int __seg6_genl_dumphmac_element(struct seg6_hmac_info *hinfo, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &seg6_genl_family, flags, cmd); if (!hdr) return -ENOMEM; if (__seg6_hmac_fill_info(hinfo, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int seg6_genl_dumphmac_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct seg6_pernet_data *sdata; struct rhashtable_iter *iter; sdata = seg6_pernet(net); iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long)iter; } rhashtable_walk_enter(&sdata->hmac_infos, iter); return 0; } static int seg6_genl_dumphmac_done(struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; rhashtable_walk_exit(iter); kfree(iter); return 0; } static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; struct seg6_hmac_info *hinfo; int ret; rhashtable_walk_start(iter); for (;;) { hinfo = rhashtable_walk_next(iter); if (IS_ERR(hinfo)) { if (PTR_ERR(hinfo) == -EAGAIN) continue; ret = PTR_ERR(hinfo); goto done; } else if (!hinfo) { break; } ret = __seg6_genl_dumphmac_element(hinfo, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, SEG6_CMD_DUMPHMAC); if (ret) goto done; } ret = skb->len; done: rhashtable_walk_stop(iter); return ret; } #else static int seg6_genl_dumphmac_start(struct netlink_callback *cb) { return 0; } static int seg6_genl_dumphmac_done(struct netlink_callback *cb) { return 0; } static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { return -ENOTSUPP; } #endif static int __net_init seg6_net_init(struct net *net) { struct seg6_pernet_data *sdata; sdata = kzalloc(sizeof(*sdata), GFP_KERNEL); if (!sdata) return -ENOMEM; mutex_init(&sdata->lock); sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL); if (!sdata->tun_src) { kfree(sdata); return -ENOMEM; } net->ipv6.seg6_data = sdata; #ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_net_init(net); #endif return 0; } static void __net_exit seg6_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); #ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_net_exit(net); #endif kfree(sdata->tun_src); kfree(sdata); } static struct pernet_operations ip6_segments_ops = { .init = seg6_net_init, .exit = seg6_net_exit, }; static const struct genl_ops seg6_genl_ops[] = { { .cmd = SEG6_CMD_SETHMAC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_sethmac, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_DUMPHMAC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .start = seg6_genl_dumphmac_start, .dumpit = seg6_genl_dumphmac, .done = seg6_genl_dumphmac_done, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_SET_TUNSRC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_set_tunsrc, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_GET_TUNSRC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_get_tunsrc, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family seg6_genl_family __ro_after_init = { .hdrsize = 0, .name = SEG6_GENL_NAME, .version = SEG6_GENL_VERSION, .maxattr = SEG6_ATTR_MAX, .policy = seg6_genl_policy, .netnsok = true, .parallel_ops = true, .ops = seg6_genl_ops, .n_ops = ARRAY_SIZE(seg6_genl_ops), .module = THIS_MODULE, }; int __init seg6_init(void) { int err; err = register_pernet_subsys(&ip6_segments_ops); if (err) goto out; err = genl_register_family(&seg6_genl_family); if (err) goto out_unregister_pernet; #ifdef CONFIG_IPV6_SEG6_LWTUNNEL err = seg6_iptunnel_init(); if (err) goto out_unregister_genl; err = seg6_local_init(); if (err) { seg6_iptunnel_exit(); goto out_unregister_genl; } #endif #ifdef CONFIG_IPV6_SEG6_HMAC err = seg6_hmac_init(); if (err) goto out_unregister_iptun; #endif pr_info("Segment Routing with IPv6\n"); out: return err; #ifdef CONFIG_IPV6_SEG6_HMAC out_unregister_iptun: #ifdef CONFIG_IPV6_SEG6_LWTUNNEL seg6_local_exit(); seg6_iptunnel_exit(); #endif #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL out_unregister_genl: #endif #if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL) || IS_ENABLED(CONFIG_IPV6_SEG6_HMAC) genl_unregister_family(&seg6_genl_family); #endif out_unregister_pernet: unregister_pernet_subsys(&ip6_segments_ops); goto out; } void seg6_exit(void) { #ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_exit(); #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL seg6_local_exit(); seg6_iptunnel_exit(); #endif genl_unregister_family(&seg6_genl_family); unregister_pernet_subsys(&ip6_segments_ops); }
15 15 15 7 5 11 31 2 29 26 3 3 12 22 22 14 8 13 13 16 23 6 52 13 31 5 13 23 62 15 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */ #include <linux/skmsg.h> #include <linux/bpf.h> #include <net/sock.h> #include <net/af_unix.h> #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ !skb_queue_empty(&__psock->ingress_skb) || \ !list_empty(&__psock->ingress_msg); \ }) static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct unix_sock *u = unix_sk(sk); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); if (!unix_sk_has_data(sk, psock)) { mutex_unlock(&u->iolock); wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); mutex_lock(&u->iolock); ret = unix_sk_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { if (sk->sk_type == SOCK_DGRAM) return __unix_dgram_recvmsg(sk, msg, len, flags); else return __unix_stream_recvmsg(sk, msg, len, flags); } static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; int copied; if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return __unix_recvmsg(sk, msg, len, flags); mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, nonblock); data = unix_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return copied; } static struct proto *unix_dgram_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_dgram_prot_lock); static struct proto unix_dgram_bpf_prot; static struct proto *unix_stream_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_stream_prot_lock); static struct proto unix_stream_bpf_prot; static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void unix_stream_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; prot->unhash = sock_map_unhash; } static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { spin_lock_bh(&unix_dgram_prot_lock); if (likely(ops != unix_dgram_prot_saved)) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); smp_store_release(&unix_dgram_prot_saved, ops); } spin_unlock_bh(&unix_dgram_prot_lock); } } static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { spin_lock_bh(&unix_stream_prot_lock); if (likely(ops != unix_stream_prot_saved)) { unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); smp_store_release(&unix_stream_prot_saved, ops); } spin_unlock_bh(&unix_stream_prot_lock); } } int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { if (sk->sk_type != SOCK_DGRAM) return -EOPNOTSUPP; if (restore) { sk->sk_write_space = psock->saved_write_space; WRITE_ONCE(sk->sk_prot, psock->sk_proto); return 0; } unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot); return 0; } int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { if (restore) { sk->sk_write_space = psock->saved_write_space; WRITE_ONCE(sk->sk_prot, psock->sk_proto); return 0; } unix_stream_bpf_check_needs_rebuild(psock->sk_proto); WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot); return 0; } void __init unix_bpf_build_proto(void) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto); unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); }
53 22 22 22 146 96 54 53 53 54 34 34 34 34 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> /* cgroup namespaces */ static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); } static void dec_cgroup_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); } static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } EXPORT_SYMBOL(free_cgroup_ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); if (!(flags & CLONE_NEWCGROUP)) { get_cgroup_ns(old_ns); return old_ns; } /* Allow only sysadmin to create cgroup namespace. */ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) { return container_of(ns, struct cgroup_namespace, ns); } static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Don't need to do anything if we are attaching to our own cgroupns. */ if (cgroup_ns == nsproxy->cgroup_ns) return 0; get_cgroup_ns(cgroup_ns); put_cgroup_ns(nsproxy->cgroup_ns); nsproxy->cgroup_ns = cgroup_ns; return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) { struct cgroup_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->cgroup_ns; get_cgroup_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void cgroupns_put(struct ns_common *ns) { put_cgroup_ns(to_cg_ns(ns)); } static struct user_namespace *cgroupns_owner(struct ns_common *ns) { return to_cg_ns(ns)->user_ns; } const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, .owner = cgroupns_owner, };
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + prandom_u32_max(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f; int nfilter; spin_lock_bh(&jsk->filters_lock); f = jsk->filters; nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; goto filter_match_found; } spin_unlock_bh(&jsk->filters_lock); return false; filter_match_found: spin_unlock_bh(&jsk->filters_lock); return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb(&jsk->sk, skb) < 0) kfree_skb(skb); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } read_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, 0, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_ts_and_drops(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data) + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(sk->sk_priority); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } while (todo_size); switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } read_unlock_bh(&priv->j1939_socks_lock); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, };
94 139 62 228 3 421 16 1 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2020 Christoph Hellwig. * * Support for "universal" pointers that can point to either kernel or userspace * memory. */ #ifndef _LINUX_SOCKPTR_H #define _LINUX_SOCKPTR_H #include <linux/slab.h> #include <linux/uaccess.h> typedef struct { union { void *kernel; void __user *user; }; bool is_kernel : 1; } sockptr_t; static inline bool sockptr_is_kernel(sockptr_t sockptr) { return sockptr.is_kernel; } static inline sockptr_t KERNEL_SOCKPTR(void *p) { return (sockptr_t) { .kernel = p, .is_kernel = true }; } static inline sockptr_t USER_SOCKPTR(void __user *p) { return (sockptr_t) { .user = p }; } static inline bool sockptr_is_null(sockptr_t sockptr) { if (sockptr_is_kernel(sockptr)) return !sockptr.kernel; return !sockptr.user; } static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { if (!sockptr_is_kernel(src)) return copy_from_user(dst, src.user + offset, size); memcpy(dst, src.kernel + offset, size); return 0; } static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { if (!sockptr_is_kernel(dst)) return copy_to_user(dst.user + offset, src, size); memcpy(dst.kernel + offset, src, size); return 0; } static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) { return copy_to_sockptr_offset(dst, 0, src, size); } static inline void *memdup_sockptr(sockptr_t src, size_t len) { void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } static inline void *memdup_sockptr_nul(sockptr_t src, size_t len) { char *p = kmalloc_track_caller(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count) { if (sockptr_is_kernel(src)) { size_t len = min(strnlen(src.kernel, count - 1) + 1, count); memcpy(dst, src.kernel, len); return len; } return strncpy_from_user(dst, src.user, count); } #endif /* _LINUX_SOCKPTR_H */
3 18 7 7 2 2 2 2 3 15 20 1 3 16 1 14 2 13 1 1 1 1 1 7 7 6 1 5 2 2 5 5 3 3 3 3 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 /* * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bug.h> #include <linux/kdev_t.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/proc_ns.h> #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> /* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { const struct bpf_prog_offload_ops *ops; struct list_head netdevs; void *priv; }; struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; struct bpf_offload_dev *offdev; struct list_head progs; struct list_head maps; struct list_head offdev_netdevs; }; static const struct rhashtable_params offdevs_params = { .nelem_hint = 4, .key_len = sizeof(struct net_device *), .key_offset = offsetof(struct bpf_offload_netdev, netdev), .head_offset = offsetof(struct bpf_offload_netdev, l), .automatic_shrinking = true, }; static struct rhashtable offdevs; static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { if (!netdev) return -EINVAL; if (!netdev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; return 0; } static struct bpf_offload_netdev * bpf_offload_find_netdev(struct net_device *netdev) { lockdep_assert_held(&bpf_devs_lock); if (!offdevs_inited) return NULL; return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); } int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) return -EINVAL; if (attr->prog_flags) return -EINVAL; offload = kzalloc(sizeof(*offload), GFP_USER); if (!offload) return -ENOMEM; offload->prog = prog; offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); err = bpf_dev_offload_check(offload->netdev); if (err) goto err_maybe_put; down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offload->netdev); if (!ondev) { err = -EINVAL; goto err_unlock; } offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); up_write(&bpf_devs_lock); return 0; err_unlock: up_write(&bpf_devs_lock); err_maybe_put: if (offload->netdev) dev_put(offload->netdev); kfree(offload); return err; } int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) { ret = offload->offdev->ops->prepare(prog); offload->dev_state = !ret; } up_read(&bpf_devs_lock); return ret; } int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) ret = offload->offdev->ops->insn_hook(env, insn_idx, prev_insn_idx); up_read(&bpf_devs_lock); return ret; } int bpf_prog_offload_finalize(struct bpf_verifier_env *env) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { if (offload->offdev->ops->finalize) ret = offload->offdev->ops->finalize(env); else ret = 0; } up_read(&bpf_devs_lock); return ret; } void bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn) { const struct bpf_prog_offload_ops *ops; struct bpf_prog_offload *offload; int ret = -EOPNOTSUPP; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { ops = offload->offdev->ops; if (!offload->opt_failed && ops->replace_insn) ret = ops->replace_insn(env, off, insn); offload->opt_failed |= ret; } up_read(&bpf_devs_lock); } void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_prog_offload *offload; int ret = -EOPNOTSUPP; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { if (!offload->opt_failed && offload->offdev->ops->remove_insns) ret = offload->offdev->ops->remove_insns(env, off, cnt); offload->opt_failed |= ret; } up_read(&bpf_devs_lock); } static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; if (offload->dev_state) offload->offdev->ops->destroy(prog); list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; } void bpf_prog_offload_destroy(struct bpf_prog *prog) { down_write(&bpf_devs_lock); if (prog->aux->offload) __bpf_prog_offload_destroy(prog); up_write(&bpf_devs_lock); } static int bpf_prog_offload_translate(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) ret = offload->offdev->ops->translate(prog); up_read(&bpf_devs_lock); return ret; } static unsigned int bpf_prog_warn_on_exec(const void *ctx, const struct bpf_insn *insn) { WARN(1, "attempt to execute device eBPF program on the host!"); return 0; } int bpf_prog_offload_compile(struct bpf_prog *prog) { prog->bpf_func = bpf_prog_warn_on_exec; return bpf_prog_offload_translate(prog); } struct ns_get_path_bpf_prog_args { struct bpf_prog *prog; struct bpf_prog_info *info; }; static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) { struct ns_get_path_bpf_prog_args *args = private_data; struct bpf_prog_aux *aux = args->prog->aux; struct ns_common *ns; struct net *net; rtnl_lock(); down_read(&bpf_devs_lock); if (aux->offload) { args->info->ifindex = aux->offload->netdev->ifindex; net = dev_net(aux->offload->netdev); get_net(net); ns = &net->ns; } else { args->info->ifindex = 0; ns = NULL; } up_read(&bpf_devs_lock); rtnl_unlock(); return ns; } int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog) { struct ns_get_path_bpf_prog_args args = { .prog = prog, .info = info, }; struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; char __user *uinsns; int res; u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (res) { if (!info->ifindex) return -ENODEV; return res; } down_read(&bpf_devs_lock); if (!aux->offload) { up_read(&bpf_devs_lock); return -ENODEV; } ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { up_read(&bpf_devs_lock); return -EFAULT; } } up_read(&bpf_devs_lock); ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; path_put(&ns_path); return 0; } const struct bpf_prog_ops bpf_offload_prog_ops = { }; static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, enum bpf_netdev_command cmd) { struct netdev_bpf data = {}; struct net_device *netdev; ASSERT_RTNL(); data.command = cmd; data.offmap = offmap; /* Caller must make sure netdev is valid */ netdev = offmap->netdev; return netdev->netdev_ops->ndo_bpf(netdev, &data); } struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (attr->map_type != BPF_MAP_TYPE_ARRAY && attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = kzalloc(sizeof(*offmap), GFP_USER); if (!offmap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&offmap->map, attr); rtnl_lock(); down_write(&bpf_devs_lock); offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); err = bpf_dev_offload_check(offmap->netdev); if (err) goto err_unlock; ondev = bpf_offload_find_netdev(offmap->netdev); if (!ondev) { err = -EINVAL; goto err_unlock; } err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); return &offmap->map; err_unlock: up_write(&bpf_devs_lock); rtnl_unlock(); kfree(offmap); return ERR_PTR(err); } static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) { WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ bpf_map_free_id(&offmap->map, true); list_del_init(&offmap->offloads); offmap->netdev = NULL; } void bpf_map_offload_map_free(struct bpf_map *map) { struct bpf_offloaded_map *offmap = map_to_offmap(map); rtnl_lock(); down_write(&bpf_devs_lock); if (offmap->netdev) __bpf_map_offload_destroy(offmap); up_write(&bpf_devs_lock); rtnl_unlock(); kfree(offmap); } int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; if (unlikely(flags > BPF_EXIST)) return -EINVAL; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_update_elem(offmap, key, value, flags); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_delete_elem(offmap, key); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); up_read(&bpf_devs_lock); return ret; } struct ns_get_path_bpf_map_args { struct bpf_offloaded_map *offmap; struct bpf_map_info *info; }; static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) { struct ns_get_path_bpf_map_args *args = private_data; struct ns_common *ns; struct net *net; rtnl_lock(); down_read(&bpf_devs_lock); if (args->offmap->netdev) { args->info->ifindex = args->offmap->netdev->ifindex; net = dev_net(args->offmap->netdev); get_net(net); ns = &net->ns; } else { args->info->ifindex = 0; ns = NULL; } up_read(&bpf_devs_lock); rtnl_unlock(); return ns; } int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct ns_get_path_bpf_map_args args = { .offmap = map_to_offmap(map), .info = info, }; struct inode *ns_inode; struct path ns_path; int res; res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); if (res) { if (!info->ifindex) return -ENODEV; return res; } ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; path_put(&ns_path); return 0; } static bool __bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) { struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; if (!bpf_prog_is_dev_bound(prog->aux)) return false; offload = prog->aux->offload; if (!offload) return false; if (offload->netdev == netdev) return true; ondev1 = bpf_offload_find_netdev(offload->netdev); ondev2 = bpf_offload_find_netdev(netdev); return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; } bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) { bool ret; down_read(&bpf_devs_lock); ret = __bpf_offload_dev_match(prog, netdev); up_read(&bpf_devs_lock); return ret; } EXPORT_SYMBOL_GPL(bpf_offload_dev_match); bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; bool ret; if (!bpf_map_is_dev_bound(map)) return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); down_read(&bpf_devs_lock); ret = __bpf_offload_dev_match(prog, offmap->netdev); up_read(&bpf_devs_lock); return ret; } int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { struct bpf_offload_netdev *ondev; int err; ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); if (!ondev) return -ENOMEM; ondev->netdev = netdev; ondev->offdev = offdev; INIT_LIST_HEAD(&ondev->progs); INIT_LIST_HEAD(&ondev->maps); down_write(&bpf_devs_lock); err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); if (err) { netdev_warn(netdev, "failed to register for BPF offload\n"); goto err_unlock_free; } list_add(&ondev->offdev_netdevs, &offdev->netdevs); up_write(&bpf_devs_lock); return 0; err_unlock_free: up_write(&bpf_devs_lock); kfree(ondev); return err; } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { struct bpf_offload_netdev *ondev, *altdev; struct bpf_offloaded_map *offmap, *mtmp; struct bpf_prog_offload *offload, *ptmp; ASSERT_RTNL(); down_write(&bpf_devs_lock); ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); if (WARN_ON(!ondev)) goto unlock; WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); list_del(&ondev->offdev_netdevs); /* Try to move the objects to another netdev of the device */ altdev = list_first_entry_or_null(&offdev->netdevs, struct bpf_offload_netdev, offdev_netdevs); if (altdev) { list_for_each_entry(offload, &ondev->progs, offloads) offload->netdev = altdev->netdev; list_splice_init(&ondev->progs, &altdev->progs); list_for_each_entry(offmap, &ondev->maps, offloads) offmap->netdev = altdev->netdev; list_splice_init(&ondev->maps, &altdev->maps); } else { list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) __bpf_prog_offload_destroy(offload->prog); list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) __bpf_map_offload_destroy(offmap); } WARN_ON(!list_empty(&ondev->progs)); WARN_ON(!list_empty(&ondev->maps)); kfree(ondev); unlock: up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; int err; down_write(&bpf_devs_lock); if (!offdevs_inited) { err = rhashtable_init(&offdevs, &offdevs_params); if (err) { up_write(&bpf_devs_lock); return ERR_PTR(err); } offdevs_inited = true; } up_write(&bpf_devs_lock); offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); if (!offdev) return ERR_PTR(-ENOMEM); offdev->ops = ops; offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; } EXPORT_SYMBOL_GPL(bpf_offload_dev_create); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) { WARN_ON(!list_empty(&offdev->netdevs)); kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) { return offdev->priv; } EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
20 56 526 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Integer base 2 logarithm calculation * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_LOG2_H #define _LINUX_LOG2_H #include <linux/types.h> #include <linux/bitops.h> /* * non-constant log of base 2 calculators * - the arch may override these in asm/bitops.h if they can be implemented * more efficiently than using fls() and fls64() * - the arch is not required to handle n==0 if implementing the fallback */ #ifndef CONFIG_ARCH_HAS_ILOG2_U32 static inline __attribute__((const)) int __ilog2_u32(u32 n) { return fls(n) - 1; } #endif #ifndef CONFIG_ARCH_HAS_ILOG2_U64 static inline __attribute__((const)) int __ilog2_u64(u64 n) { return fls64(n) - 1; } #endif /** * is_power_of_2() - check if a value is a power of two * @n: the value to check * * Determine whether some value is a power of two, where zero is * *not* considered a power of two. * Return: true if @n is a power of 2, otherwise false. */ static inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return (n != 0 && ((n & (n - 1)) == 0)); } /** * __roundup_pow_of_two() - round up to nearest power of two * @n: value to round up */ static inline __attribute__((const)) unsigned long __roundup_pow_of_two(unsigned long n) { return 1UL << fls_long(n - 1); } /** * __rounddown_pow_of_two() - round down to nearest power of two * @n: value to round down */ static inline __attribute__((const)) unsigned long __rounddown_pow_of_two(unsigned long n) { return 1UL << (fls_long(n) - 1); } /** * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value * @n: parameter * * Use this where sparse expects a true constant expression, e.g. for array * indices. */ #define const_ilog2(n) \ ( \ __builtin_constant_p(n) ? ( \ (n) < 2 ? 0 : \ (n) & (1ULL << 63) ? 63 : \ (n) & (1ULL << 62) ? 62 : \ (n) & (1ULL << 61) ? 61 : \ (n) & (1ULL << 60) ? 60 : \ (n) & (1ULL << 59) ? 59 : \ (n) & (1ULL << 58) ? 58 : \ (n) & (1ULL << 57) ? 57 : \ (n) & (1ULL << 56) ? 56 : \ (n) & (1ULL << 55) ? 55 : \ (n) & (1ULL << 54) ? 54 : \ (n) & (1ULL << 53) ? 53 : \ (n) & (1ULL << 52) ? 52 : \ (n) & (1ULL << 51) ? 51 : \ (n) & (1ULL << 50) ? 50 : \ (n) & (1ULL << 49) ? 49 : \ (n) & (1ULL << 48) ? 48 : \ (n) & (1ULL << 47) ? 47 : \ (n) & (1ULL << 46) ? 46 : \ (n) & (1ULL << 45) ? 45 : \ (n) & (1ULL << 44) ? 44 : \ (n) & (1ULL << 43) ? 43 : \ (n) & (1ULL << 42) ? 42 : \ (n) & (1ULL << 41) ? 41 : \ (n) & (1ULL << 40) ? 40 : \ (n) & (1ULL << 39) ? 39 : \ (n) & (1ULL << 38) ? 38 : \ (n) & (1ULL << 37) ? 37 : \ (n) & (1ULL << 36) ? 36 : \ (n) & (1ULL << 35) ? 35 : \ (n) & (1ULL << 34) ? 34 : \ (n) & (1ULL << 33) ? 33 : \ (n) & (1ULL << 32) ? 32 : \ (n) & (1ULL << 31) ? 31 : \ (n) & (1ULL << 30) ? 30 : \ (n) & (1ULL << 29) ? 29 : \ (n) & (1ULL << 28) ? 28 : \ (n) & (1ULL << 27) ? 27 : \ (n) & (1ULL << 26) ? 26 : \ (n) & (1ULL << 25) ? 25 : \ (n) & (1ULL << 24) ? 24 : \ (n) & (1ULL << 23) ? 23 : \ (n) & (1ULL << 22) ? 22 : \ (n) & (1ULL << 21) ? 21 : \ (n) & (1ULL << 20) ? 20 : \ (n) & (1ULL << 19) ? 19 : \ (n) & (1ULL << 18) ? 18 : \ (n) & (1ULL << 17) ? 17 : \ (n) & (1ULL << 16) ? 16 : \ (n) & (1ULL << 15) ? 15 : \ (n) & (1ULL << 14) ? 14 : \ (n) & (1ULL << 13) ? 13 : \ (n) & (1ULL << 12) ? 12 : \ (n) & (1ULL << 11) ? 11 : \ (n) & (1ULL << 10) ? 10 : \ (n) & (1ULL << 9) ? 9 : \ (n) & (1ULL << 8) ? 8 : \ (n) & (1ULL << 7) ? 7 : \ (n) & (1ULL << 6) ? 6 : \ (n) & (1ULL << 5) ? 5 : \ (n) & (1ULL << 4) ? 4 : \ (n) & (1ULL << 3) ? 3 : \ (n) & (1ULL << 2) ? 2 : \ 1) : \ -1) /** * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value * @n: parameter * * constant-capable log of base 2 calculation * - this can be used to initialise global variables from constant data, hence * the massive ternary operator construction * * selects the appropriately-sized optimised version depending on sizeof(n) */ #define ilog2(n) \ ( \ __builtin_constant_p(n) ? \ ((n) < 2 ? 0 : \ 63 - __builtin_clzll(n)) : \ (sizeof(n) <= 4) ? \ __ilog2_u32(n) : \ __ilog2_u64(n) \ ) /** * roundup_pow_of_two - round the given value up to nearest power of two * @n: parameter * * round the given value up to the nearest power of two * - the result is undefined when n == 0 * - this can be used to initialise global variables from constant data */ #define roundup_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 1) ? 1 : \ (1UL << (ilog2((n) - 1) + 1)) \ ) : \ __roundup_pow_of_two(n) \ ) /** * rounddown_pow_of_two - round the given value down to nearest power of two * @n: parameter * * round the given value down to the nearest power of two * - the result is undefined when n == 0 * - this can be used to initialise global variables from constant data */ #define rounddown_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ (1UL << ilog2(n))) : \ __rounddown_pow_of_two(n) \ ) static inline __attribute_const__ int __order_base_2(unsigned long n) { return n > 1 ? ilog2(n - 1) + 1 : 0; } /** * order_base_2 - calculate the (rounded up) base 2 order of the argument * @n: parameter * * The first few values calculated by this routine: * ob2(0) = 0 * ob2(1) = 0 * ob2(2) = 1 * ob2(3) = 2 * ob2(4) = 2 * ob2(5) = 3 * ... and so on. */ #define order_base_2(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 0 || (n) == 1) ? 0 : \ ilog2((n) - 1) + 1) : \ __order_base_2(n) \ ) static inline __attribute__((const)) int __bits_per(unsigned long n) { if (n < 2) return 1; if (is_power_of_2(n)) return order_base_2(n) + 1; return order_base_2(n); } /** * bits_per - calculate the number of bits required for the argument * @n: parameter * * This is constant-capable and can be used for compile time * initializations, e.g bitfields. * * The first few values calculated by this routine: * bf(0) = 1 * bf(1) = 1 * bf(2) = 2 * bf(3) = 2 * bf(4) = 3 * ... and so on. */ #define bits_per(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 0 || (n) == 1) \ ? 1 : ilog2(n) + 1 \ ) : \ __bits_per(n) \ ) #endif /* _LINUX_LOG2_H */
217 107 1 199 1 106 145 145 57 2 667 435 16 9 80 403 187 26 10 2 1 3 21 354 3 2 1 107 200 136 309 45 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include "disasm.h" #define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) static const char * const func_id_str[] = { __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) }; #undef __BPF_FUNC_STR_FN static const char *__func_get_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; if (cbs && cbs->cb_call) { const char *res; res = cbs->cb_call(cbs->private_data, insn); if (res) return res; } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) snprintf(buff, len, "kernel-function"); return buff; } static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, u64 full_imm, char *buff, size_t len) { if (cbs && cbs->cb_imm) return cbs->cb_imm(cbs->private_data, insn, full_imm); snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); return buff; } const char *func_id_name(int id) { if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else return "unknown"; } const char *const bpf_class_string[8] = { [BPF_LD] = "ld", [BPF_LDX] = "ldx", [BPF_ST] = "st", [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", [BPF_DIV >> 4] = "/=", [BPF_OR >> 4] = "|=", [BPF_AND >> 4] = "&=", [BPF_LSH >> 4] = "<<=", [BPF_RSH >> 4] = ">>=", [BPF_NEG >> 4] = "neg", [BPF_MOD >> 4] = "%=", [BPF_XOR >> 4] = "^=", [BPF_MOV >> 4] = "=", [BPF_ARSH >> 4] = "s>>=", [BPF_END >> 4] = "endian", }; static const char *const bpf_atomic_alu_string[16] = { [BPF_ADD >> 4] = "add", [BPF_AND >> 4] = "and", [BPF_OR >> 4] = "or", [BPF_XOR >> 4] = "xor", }; static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", [BPF_B >> 3] = "u8", [BPF_DW >> 3] = "u64", }; static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", [BPF_JLT >> 4] = "<", [BPF_JGE >> 4] = ">=", [BPF_JLE >> 4] = "<=", [BPF_JSET >> 4] = "&", [BPF_JNE >> 4] = "!=", [BPF_JSGT >> 4] = "s>", [BPF_JSLT >> 4] = "s<", [BPF_JSGE >> 4] = "s>=", [BPF_JSLE >> 4] = "s<=", [BPF_CALL >> 4] = "call", [BPF_EXIT >> 4] = "exit", }; static void print_bpf_end_insn(bpf_insn_print_t verbose, void *private_data, const struct bpf_insn *insn) { verbose(private_data, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", insn->imm, insn->dst_reg); } void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) { const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { verbose(cbs->private_data, "(%02x) %c%d %s %d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], insn->imm); } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == BPF_ADD || insn->imm == BPF_AND || insn->imm == BPF_OR || insn->imm == BPF_XOR)) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, bpf_alu_string[BPF_OP(insn->imm) >> 4], insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == (BPF_ADD | BPF_FETCH) || insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH))) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_CMPXCHG) { verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", insn->code, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_XCHG) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) == BPF_MEM) { verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { verbose(cbs->private_data, "(%02x) nospec\n", insn->code); } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); } } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); } else if (BPF_MODE(insn->code) == BPF_IMM && BPF_SIZE(insn->code) == BPF_DW) { /* At this point, we already made sure that the second * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, imm, tmp, sizeof(tmp))); } else { verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) if %c%d %s %c%d goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { verbose(cbs->private_data, "(%02x) if %c%d %s 0x%x goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } }
6 5 27 27 1 4 3 2 1 13 13 13 13 13 13 23 23 23 2 21 6 1 5 5 4 5 1 5 4 3 11 1 1 9 4 6 1 1 1 1 4 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2018 Facebook */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> struct reuseport_array { struct bpf_map map; struct sock __rcu *ptrs[]; }; static struct reuseport_array *reuseport_array(struct bpf_map *map) { return (struct reuseport_array *)map; } /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { uintptr_t sk_user_data; write_lock_bh(&sk->sk_callback_lock); sk_user_data = (uintptr_t)sk->sk_user_data; if (sk_user_data & SK_USER_DATA_BPF) { struct sock __rcu **socks; socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of * sk->sk_callback_lock because there is * a race with reuseport_array_free() * which does not hold the reuseport_lock. */ RCU_INIT_POINTER(*socks, NULL); } write_unlock_bh(&sk->sk_callback_lock); } static int reuseport_array_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) return -EINVAL; return array_map_alloc_check(attr); } static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return rcu_dereference(array->ptrs[index]); } /* Called from syscall only */ static int reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; struct sock *sk; int err; if (index >= map->max_entries) return -E2BIG; if (!rcu_access_pointer(array->ptrs[index])) return -ENOENT; spin_lock_bh(&reuseport_lock); sk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); if (sk) { write_lock_bh(&sk->sk_callback_lock); WRITE_ONCE(sk->sk_user_data, NULL); RCU_INIT_POINTER(array->ptrs[index], NULL); write_unlock_bh(&sk->sk_callback_lock); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&reuseport_lock); return err; } static void reuseport_array_free(struct bpf_map *map) { struct reuseport_array *array = reuseport_array(map); struct sock *sk; u32 i; /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with * bpf_sk_reuseport_detach() which was triggered by * close() or disconnect(). * * This function and bpf_sk_reuseport_detach() are * both removing sk from "array". Who removes it * first does not matter. * * The only concern here is bpf_sk_reuseport_detach() * may access "array" which is being freed here. * bpf_sk_reuseport_detach() access this "array" * through sk->sk_user_data _and_ with sk->sk_callback_lock * held which is enough because this "array" is not freed * until all sk->sk_user_data has stopped referencing this "array". * * Hence, due to the above, taking "reuseport_lock" is not * needed here. */ /* * Since reuseport_lock is not taken, sk is accessed under * rcu_read_lock() */ rcu_read_lock(); for (i = 0; i < map->max_entries; i++) { sk = rcu_dereference(array->ptrs[i]); if (sk) { write_lock_bh(&sk->sk_callback_lock); /* * No need for WRITE_ONCE(). At this point, * no one is reading it without taking the * sk->sk_callback_lock. */ sk->sk_user_data = NULL; write_unlock_bh(&sk->sk_callback_lock); RCU_INIT_POINTER(array->ptrs[i], NULL); } } rcu_read_unlock(); /* * Once reaching here, all sk->sk_user_data is not * referenceing this "array". "array" can be freed now. */ bpf_map_area_free(array); } static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; u64 array_size; if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); return &array->map; } int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { struct sock *sk; int err; if (map->value_size != sizeof(u64)) return -ENOSPC; rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; } rcu_read_unlock(); return err; } static int reuseport_array_update_check(const struct reuseport_array *array, const struct sock *nsk, const struct sock *osk, const struct sock_reuseport *nsk_reuse, u32 map_flags) { if (osk && map_flags == BPF_NOEXIST) return -EEXIST; if (!osk && map_flags == BPF_EXIST) return -ENOENT; if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) return -ENOTSUPP; if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) return -ENOTSUPP; if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) return -ENOTSUPP; /* * sk must be hashed (i.e. listening in the TCP case or binded * in the UDP case) and * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). * * Also, sk will be used in bpf helper that is protected by * rcu_read_lock(). */ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) return -EINVAL; /* READ_ONCE because the sk->sk_callback_lock may not be held here */ if (READ_ONCE(nsk->sk_user_data)) return -EBUSY; return 0; } /* * Called from syscall only. * The "nsk" in the fd refcnt. * The "osk" and "reuse" are protected by reuseport_lock. */ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct reuseport_array *array = reuseport_array(map); struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; uintptr_t sk_user_data; struct socket *socket; int err, fd; if (map_flags > BPF_EXIST) return -EINVAL; if (index >= map->max_entries) return -E2BIG; if (map->value_size == sizeof(u64)) { u64 fd64 = *(u64 *)value; if (fd64 > S32_MAX) return -EINVAL; fd = fd64; } else { fd = *(int *)value; } socket = sockfd_lookup(fd, &err); if (!socket) return err; nsk = socket->sk; if (!nsk) { err = -EINVAL; goto put_file; } /* Quick checks before taking reuseport_lock */ err = reuseport_array_update_check(array, nsk, rcu_access_pointer(array->ptrs[index]), rcu_access_pointer(nsk->sk_reuseport_cb), map_flags); if (err) goto put_file; spin_lock_bh(&reuseport_lock); /* * Some of the checks only need reuseport_lock * but it is done under sk_callback_lock also * for simplicity reason. */ write_lock_bh(&nsk->sk_callback_lock); osk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); if (err) goto put_file_unlock; sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; put_file_unlock: write_unlock_bh(&nsk->sk_callback_lock); if (free_osk) { write_lock_bh(&free_osk->sk_callback_lock); WRITE_ONCE(free_osk->sk_user_data, NULL); write_unlock_bh(&free_osk->sk_callback_lock); } spin_unlock_bh(&reuseport_lock); put_file: fput(socket->file); return err; } /* Called from syscall */ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct reuseport_array *array = reuseport_array(map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, .map_btf_name = "reuseport_array", .map_btf_id = &reuseport_array_map_btf_id, };
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* * include/net/tipc.h: Include file for TIPC message header routines * * Copyright (c) 2017 Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_HDR_H #define _TIPC_HDR_H #include <linux/random.h> #define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ struct tipc_basic_hdr { __be32 w[4]; }; static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) { u32 w0 = ntohl(hdr->w[0]); bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; __be32 key; /* Return source node identity as key */ if (likely(!keepalive_msg)) return hdr->w[3]; /* Spread PROBE/PROBE_REPLY messages across the cores */ get_random_bytes(&key, sizeof(key)); return key; } #endif
455 455 2 454 454 454 116 2 117 156 19 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - sysfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" static int sysfs_do_create_link_sd(struct kernfs_node *parent, struct kobject *target_kobj, const char *name, int warn) { struct kernfs_node *kn, *target = NULL; if (WARN_ON(!name || !parent)) return -EINVAL; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (target_kobj->sd) { target = target_kobj->sd; kernfs_get(target); } spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; kn = kernfs_create_link(parent, name, target); kernfs_put(target); if (!IS_ERR(kn)) return 0; if (warn && PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (!parent) return -EFAULT; return sysfs_do_create_link_sd(parent, target, name, warn); } /** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 1); } EXPORT_SYMBOL_GPL(sysfs_create_link); /** * sysfs_create_link_nowarn - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. * * This function does the same as sysfs_create_link(), but it * doesn't warn if the link already exists. */ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 0); } EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @name: name of the symlink to remove. * * Unlike sysfs_remove_link sysfs_delete_link has enough information * to successfully delete symlinks in tagged directories. */ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, const char *name) { const void *ns = NULL; /* * We don't own @target and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (targ->sd && kernfs_ns_enabled(kobj->sd)) ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. */ void sysfs_remove_link(struct kobject *kobj, const char *name) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); /** * sysfs_rename_link_ns - rename symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @old: previous name of the symlink. * @new: new name of the symlink. * @new_ns: new namespace of the symlink. * * A helper function for the common rename symlink idiom. */ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (targ->sd) old_ns = targ->sd->ns; result = -ENOENT; kn = kernfs_find_and_get_ns(parent, old, old_ns); if (!kn) goto out; result = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; result = kernfs_rename_ns(kn, parent, new, new_ns); out: kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
6 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 #undef TRACE_SYSTEM #define TRACE_SYSTEM bridge #if !defined(_TRACE_BRIDGE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BRIDGE_H #include <linux/netdevice.h> #include <linux/tracepoint.h> #include "../../../net/bridge/br_private.h" TRACE_EVENT(br_fdb_add, TP_PROTO(struct ndmsg *ndm, struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags), TP_ARGS(ndm, dev, addr, vid, nlh_flags), TP_STRUCT__entry( __field(u8, ndm_flags) __string(dev, dev->name) __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) __field(u16, nlh_flags) ), TP_fast_assign( __assign_str(dev, dev->name); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; __entry->nlh_flags = nlh_flags; __entry->ndm_flags = ndm->ndm_flags; ), TP_printk("dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u nlh_flags %04x ndm_flags %02x", __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid, __entry->nlh_flags, __entry->ndm_flags) ); TRACE_EVENT(br_fdb_external_learn_add, TP_PROTO(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid), TP_ARGS(br, p, addr, vid), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, p ? p->dev->name : "null") __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) ), TP_fast_assign( __assign_str(br_dev, br->dev->name); __assign_str(dev, p ? p->dev->name : "null"); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; ), TP_printk("br_dev %s port %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid) ); TRACE_EVENT(fdb_delete, TP_PROTO(struct net_bridge *br, struct net_bridge_fdb_entry *f), TP_ARGS(br, f), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, f->dst ? f->dst->dev->name : "null") __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) ), TP_fast_assign( __assign_str(br_dev, br->dev->name); __assign_str(dev, f->dst ? f->dst->dev->name : "null"); memcpy(__entry->addr, f->key.addr.addr, ETH_ALEN); __entry->vid = f->key.vlan_id; ), TP_printk("br_dev %s dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid) ); TRACE_EVENT(br_fdb_update, TP_PROTO(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags), TP_ARGS(br, source, addr, vid, flags), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, source->dev->name) __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) __field(unsigned long, flags) ), TP_fast_assign( __assign_str(br_dev, br->dev->name); __assign_str(dev, source->dev->name); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; __entry->flags = flags; ), TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u flags 0x%lx", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid, __entry->flags) ); #endif /* _TRACE_BRIDGE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 39 9 39 37 37 37 12 12 12 11 12 4 4 4 4 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 22 8 30 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 33 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 28 28 27 2 2 2 2 2 2 1 1 1 1 7 7 7 7 7 7 7 7 16 43 43 24 24 24 9 17 9 27 27 27 22 22 22 22 21 22 27 27 27 22 902 845 70 25 27 24 7 24 24 22 3 24 13 8 7 13 1 6 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 // SPDX-License-Identifier: GPL-2.0-only /* * This is the linux wireless configuration interface. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2022 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> #include <linux/module.h> #include <linux/err.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/nl80211.h> #include <linux/debugfs.h> #include <linux/notifier.h> #include <linux/device.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/sched.h> #include <net/genetlink.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "sysfs.h" #include "debugfs.h" #include "wext-compat.h" #include "rdev-ops.h" /* name for sysfs, %d is appended */ #define PHY_NAME "phy" MODULE_AUTHOR("Johannes Berg"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("wireless configuration support"); MODULE_ALIAS_GENL_FAMILY(NL80211_GENL_NAME); /* RCU-protected (and RTNL for writers) */ LIST_HEAD(cfg80211_rdev_list); int cfg80211_rdev_list_generation; /* for debugfs */ static struct dentry *ieee80211_debugfs_dir; /* for the cleanup, scan and event works */ struct workqueue_struct *cfg80211_wq; static bool cfg80211_disable_40mhz_24ghz; module_param(cfg80211_disable_40mhz_24ghz, bool, 0644); MODULE_PARM_DESC(cfg80211_disable_40mhz_24ghz, "Disable 40MHz support in the 2.4GHz band"); struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx) { struct cfg80211_registered_device *result = NULL, *rdev; ASSERT_RTNL(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (rdev->wiphy_idx == wiphy_idx) { result = rdev; break; } } return result; } int get_wiphy_idx(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); return rdev->wiphy_idx; } struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); rdev = cfg80211_rdev_by_wiphy_idx(wiphy_idx); if (!rdev) return NULL; return &rdev->wiphy; } static int cfg80211_dev_check_name(struct cfg80211_registered_device *rdev, const char *newname) { struct cfg80211_registered_device *rdev2; int wiphy_idx, taken = -1, digits; ASSERT_RTNL(); if (strlen(newname) > NL80211_WIPHY_NAME_MAXLEN) return -EINVAL; /* prohibit calling the thing phy%d when %d is not its number */ sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { /* count number of places needed to print wiphy_idx */ digits = 1; while (wiphy_idx /= 10) digits++; /* * deny the name if it is phy<idx> where <idx> is printed * without leading zeroes. taken == strlen(newname) here */ if (taken == strlen(PHY_NAME) + digits) return -EINVAL; } /* Ensure another device does not already have this name. */ list_for_each_entry(rdev2, &cfg80211_rdev_list, list) if (strcmp(newname, wiphy_name(&rdev2->wiphy)) == 0) return -EINVAL; return 0; } int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname) { int result; ASSERT_RTNL(); /* Ignore nop renames */ if (strcmp(newname, wiphy_name(&rdev->wiphy)) == 0) return 0; result = cfg80211_dev_check_name(rdev, newname); if (result < 0) return result; result = device_rename(&rdev->wiphy.dev, newname); if (result) return result; if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir)) debugfs_rename(rdev->wiphy.debugfsdir->d_parent, rdev->wiphy.debugfsdir, rdev->wiphy.debugfsdir->d_parent, newname); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); return 0; } int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, struct net *net) { struct wireless_dev *wdev; int err = 0; if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK)) return -EOPNOTSUPP; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } if (err) { /* failed -- clean up to old netns */ net = wiphy_net(&rdev->wiphy); list_for_each_entry_continue_reverse(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); WARN_ON(err); wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } return err; } list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); } nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); wiphy_net_set(&rdev->wiphy, net); err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev)); WARN_ON(err); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } return 0; } static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) { struct cfg80211_registered_device *rdev = data; wiphy_lock(&rdev->wiphy); rdev_rfkill_poll(rdev); wiphy_unlock(&rdev->wiphy); } void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)) return; if (!wdev_running(wdev)) return; rdev_stop_p2p_device(rdev, wdev); wdev->is_running = false; rdev->opencount--; if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } } void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) return; if (!wdev_running(wdev)) return; rdev_stop_nan(rdev, wdev); wdev->is_running = false; rdev->opencount--; } void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct wireless_dev *wdev; ASSERT_RTNL(); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->netdev) { dev_close(wdev->netdev); continue; } /* otherwise, check iftype */ wiphy_lock(wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; case NL80211_IFTYPE_NAN: cfg80211_stop_nan(rdev, wdev); break; default: break; } wiphy_unlock(wiphy); } } EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces); static int cfg80211_rfkill_set_block(void *data, bool blocked) { struct cfg80211_registered_device *rdev = data; if (!blocked) return 0; rtnl_lock(); cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return 0; } static void cfg80211_rfkill_block_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, rfkill_block); cfg80211_rfkill_set_block(rdev, true); } static void cfg80211_event_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, event_work); wiphy_lock(&rdev->wiphy); cfg80211_process_rdev_events(rdev); wiphy_unlock(&rdev->wiphy); } void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(wdev, tmp, &rdev->wiphy.wdev_list, list) { if (wdev->nl_owner_dead) { if (wdev->netdev) dev_close(wdev->netdev); wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); rdev_del_virtual_intf(rdev, wdev); wiphy_unlock(&rdev->wiphy); } } } static void cfg80211_destroy_iface_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, destroy_work); rtnl_lock(); cfg80211_destroy_ifaces(rdev); rtnl_unlock(); } static void cfg80211_sched_scan_stop_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_stop_wk); wiphy_lock(&rdev->wiphy); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->nl_owner_dead) cfg80211_stop_sched_scan_req(rdev, req, false); } wiphy_unlock(&rdev->wiphy); } static void cfg80211_propagate_radar_detect_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, propagate_radar_detect_wk); rtnl_lock(); regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->radar_chandef, NL80211_DFS_UNAVAILABLE, NL80211_RADAR_DETECTED); rtnl_unlock(); } static void cfg80211_propagate_cac_done_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, propagate_cac_done_wk); rtnl_lock(); regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->cac_done_chandef, NL80211_DFS_AVAILABLE, NL80211_RADAR_CAC_FINISHED); rtnl_unlock(); } /* exported functions */ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, const char *requested_name) { static atomic_t wiphy_counter = ATOMIC_INIT(0); struct cfg80211_registered_device *rdev; int alloc_size; WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key)); WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc)); WARN_ON(ops->connect && !ops->disconnect); WARN_ON(ops->join_ibss && !ops->leave_ibss); WARN_ON(ops->add_virtual_intf && !ops->del_virtual_intf); WARN_ON(ops->add_station && !ops->del_station); WARN_ON(ops->add_mpath && !ops->del_mpath); WARN_ON(ops->join_mesh && !ops->leave_mesh); WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device); WARN_ON(ops->start_ap && !ops->stop_ap); WARN_ON(ops->join_ocb && !ops->leave_ocb); WARN_ON(ops->suspend && !ops->resume); WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop); WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel); WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch); WARN_ON(ops->add_tx_ts && !ops->del_tx_ts); alloc_size = sizeof(*rdev) + sizeof_priv; rdev = kzalloc(alloc_size, GFP_KERNEL); if (!rdev) return NULL; rdev->ops = ops; rdev->wiphy_idx = atomic_inc_return(&wiphy_counter); if (unlikely(rdev->wiphy_idx < 0)) { /* ugh, wrapped! */ atomic_dec(&wiphy_counter); kfree(rdev); return NULL; } /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wiphy_idx--; /* give it a proper name */ if (requested_name && requested_name[0]) { int rv; rtnl_lock(); rv = cfg80211_dev_check_name(rdev, requested_name); if (rv < 0) { rtnl_unlock(); goto use_default_name; } rv = dev_set_name(&rdev->wiphy.dev, "%s", requested_name); rtnl_unlock(); if (rv) goto use_default_name; } else { int rv; use_default_name: /* NOTE: This is *probably* safe w/out holding rtnl because of * the restrictions on phy names. Probably this call could * fail if some other part of the kernel (re)named a device * phyX. But, might should add some locking and check return * value, and use a different name if this one exists? */ rv = dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); if (rv < 0) { kfree(rdev); return NULL; } } mutex_init(&rdev->wiphy.mtx); INIT_LIST_HEAD(&rdev->wiphy.wdev_list); INIT_LIST_HEAD(&rdev->beacon_registrations); spin_lock_init(&rdev->beacon_registrations_lock); spin_lock_init(&rdev->bss_lock); INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT rdev->wiphy.wext = &cfg80211_wext_handler; #endif device_initialize(&rdev->wiphy.dev); rdev->wiphy.dev.class = &ieee80211_class; rdev->wiphy.dev.platform_data = rdev; device_enable_async_suspend(&rdev->wiphy.dev); INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk); INIT_WORK(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk); INIT_WORK(&rdev->sched_scan_res_wk, cfg80211_sched_scan_results_wk); INIT_WORK(&rdev->propagate_radar_detect_wk, cfg80211_propagate_radar_detect_wk); INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); INIT_WORK(&rdev->mgmt_registrations_update_wk, cfg80211_mgmt_registrations_update_wk); spin_lock_init(&rdev->mgmt_registrations_lock); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; #endif wiphy_net_set(&rdev->wiphy, &init_net); rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block; rdev->wiphy.rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), &rdev->wiphy.dev, RFKILL_TYPE_WLAN, &rdev->rfkill_ops, rdev); if (!rdev->wiphy.rfkill) { wiphy_free(&rdev->wiphy); return NULL; } INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work); INIT_WORK(&rdev->conn_work, cfg80211_conn_work); INIT_WORK(&rdev->event_work, cfg80211_event_work); init_waitqueue_head(&rdev->dev_wait); /* * Initialize wiphy parameters to IEEE 802.11 MIB default values. * Fragmentation and RTS threshold are disabled by default with the * special -1 value. */ rdev->wiphy.retry_short = 7; rdev->wiphy.retry_long = 4; rdev->wiphy.frag_threshold = (u32) -1; rdev->wiphy.rts_threshold = (u32) -1; rdev->wiphy.coverage_class = 0; rdev->wiphy.max_num_csa_counters = 1; rdev->wiphy.max_sched_scan_plans = 1; rdev->wiphy.max_sched_scan_plan_interval = U32_MAX; return &rdev->wiphy; } EXPORT_SYMBOL(wiphy_new_nm); static int wiphy_verify_combinations(struct wiphy *wiphy) { const struct ieee80211_iface_combination *c; int i, j; for (i = 0; i < wiphy->n_iface_combinations; i++) { u32 cnt = 0; u16 all_iftypes = 0; c = &wiphy->iface_combinations[i]; /* * Combinations with just one interface aren't real, * however we make an exception for DFS. */ if (WARN_ON((c->max_interfaces < 2) && !c->radar_detect_widths)) return -EINVAL; /* Need at least one channel */ if (WARN_ON(!c->num_different_channels)) return -EINVAL; /* DFS only works on one channel. */ if (WARN_ON(c->radar_detect_widths && (c->num_different_channels > 1))) return -EINVAL; if (WARN_ON(!c->n_limits)) return -EINVAL; for (j = 0; j < c->n_limits; j++) { u16 types = c->limits[j].types; /* interface types shouldn't overlap */ if (WARN_ON(types & all_iftypes)) return -EINVAL; all_iftypes |= types; if (WARN_ON(!c->limits[j].max)) return -EINVAL; /* Shouldn't list software iftypes in combinations! */ if (WARN_ON(wiphy->software_iftypes & types)) return -EINVAL; /* Only a single P2P_DEVICE can be allowed */ if (WARN_ON(types & BIT(NL80211_IFTYPE_P2P_DEVICE) && c->limits[j].max > 1)) return -EINVAL; /* Only a single NAN can be allowed */ if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && c->limits[j].max > 1)) return -EINVAL; /* * This isn't well-defined right now. If you have an * IBSS interface, then its beacon interval may change * by joining other networks, and nothing prevents it * from doing that. * So technically we probably shouldn't even allow AP * and IBSS in the same interface, but it seems that * some drivers support that, possibly only with fixed * beacon intervals for IBSS. */ if (WARN_ON(types & BIT(NL80211_IFTYPE_ADHOC) && c->beacon_int_min_gcd)) { return -EINVAL; } cnt += c->limits[j].max; /* * Don't advertise an unsupported type * in a combination. */ if (WARN_ON((wiphy->interface_modes & types) != types)) return -EINVAL; } if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; /* You can't even choose that many! */ if (WARN_ON(cnt < c->max_interfaces)) return -EINVAL; } return 0; } int wiphy_register(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); int res; enum nl80211_band band; struct ieee80211_supported_band *sband; bool have_band = false; int i; u16 ifmodes = wiphy->interface_modes; #ifdef CONFIG_PM if (WARN_ON(wiphy->wowlan && (wiphy->wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && !(wiphy->wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY))) return -EINVAL; if (WARN_ON(wiphy->wowlan && !wiphy->wowlan->flags && !wiphy->wowlan->n_patterns && !wiphy->wowlan->tcp)) return -EINVAL; #endif if (WARN_ON((wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH) && (!rdev->ops->tdls_channel_switch || !rdev->ops->tdls_cancel_channel_switch))) return -EINVAL; if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && (!rdev->ops->start_nan || !rdev->ops->stop_nan || !rdev->ops->add_nan_func || !rdev->ops->del_nan_func || !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ))))) return -EINVAL; if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported)) return -EINVAL; if (wiphy->pmsr_capa && wiphy->pmsr_capa->ftm.supported) { if (WARN_ON(!wiphy->pmsr_capa->ftm.asap && !wiphy->pmsr_capa->ftm.non_asap)) return -EINVAL; if (WARN_ON(!wiphy->pmsr_capa->ftm.preambles || !wiphy->pmsr_capa->ftm.bandwidths)) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa->ftm.preambles & ~(BIT(NL80211_PREAMBLE_LEGACY) | BIT(NL80211_PREAMBLE_HT) | BIT(NL80211_PREAMBLE_VHT) | BIT(NL80211_PREAMBLE_HE) | BIT(NL80211_PREAMBLE_DMG)))) return -EINVAL; if (WARN_ON((wiphy->pmsr_capa->ftm.trigger_based || wiphy->pmsr_capa->ftm.non_trigger_based) && !(wiphy->pmsr_capa->ftm.preambles & BIT(NL80211_PREAMBLE_HE)))) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths & ~(BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80) | BIT(NL80211_CHAN_WIDTH_80P80) | BIT(NL80211_CHAN_WIDTH_160) | BIT(NL80211_CHAN_WIDTH_5) | BIT(NL80211_CHAN_WIDTH_10)))) return -EINVAL; } /* * if a wiphy has unsupported modes for regulatory channel enforcement, * opt-out of enforcement checking */ if (wiphy->interface_modes & ~(BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_P2P_CLIENT) | BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO) | BIT(NL80211_IFTYPE_ADHOC) | BIT(NL80211_IFTYPE_P2P_DEVICE) | BIT(NL80211_IFTYPE_NAN) | BIT(NL80211_IFTYPE_AP_VLAN) | BIT(NL80211_IFTYPE_MONITOR))) wiphy->regulatory_flags |= REGULATORY_IGNORE_STALE_KICKOFF; if (WARN_ON((wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) && (wiphy->regulatory_flags & (REGULATORY_CUSTOM_REG | REGULATORY_STRICT_REG | REGULATORY_COUNTRY_IE_FOLLOW_POWER | REGULATORY_COUNTRY_IE_IGNORE)))) return -EINVAL; if (WARN_ON(wiphy->coalesce && (!wiphy->coalesce->n_rules || !wiphy->coalesce->n_patterns) && (!wiphy->coalesce->pattern_min_len || wiphy->coalesce->pattern_min_len > wiphy->coalesce->pattern_max_len))) return -EINVAL; if (WARN_ON(wiphy->ap_sme_capa && !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME))) return -EINVAL; if (WARN_ON(wiphy->addresses && !wiphy->n_addresses)) return -EINVAL; if (WARN_ON(wiphy->addresses && !is_zero_ether_addr(wiphy->perm_addr) && memcmp(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN))) return -EINVAL; if (WARN_ON(wiphy->max_acl_mac_addrs && (!(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME) || !rdev->ops->set_mac_acl))) return -EINVAL; /* assure only valid behaviours are flagged by driver * hence subtract 2 as bit 0 is invalid. */ if (WARN_ON(wiphy->bss_select_support && (wiphy->bss_select_support & ~(BIT(__NL80211_BSS_SELECT_ATTR_AFTER_LAST) - 2)))) return -EINVAL; if (WARN_ON(wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X) && (!rdev->ops->set_pmk || !rdev->ops->del_pmk))) return -EINVAL; if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && rdev->ops->update_connect_params)) return -EINVAL; if (wiphy->addresses) memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); /* sanity check ifmodes */ WARN_ON(!ifmodes); ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1; if (WARN_ON(ifmodes != wiphy->interface_modes)) wiphy->interface_modes = ifmodes; res = wiphy_verify_combinations(wiphy); if (res) return res; /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { u16 types = 0; bool have_he = false; sband = wiphy->bands[band]; if (!sband) continue; sband->band = band; if (WARN_ON(!sband->n_channels)) return -EINVAL; /* * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ if (WARN_ON((band != NL80211_BAND_60GHZ && band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; if (WARN_ON(band == NL80211_BAND_6GHZ && (sband->ht_cap.ht_supported || sband->vht_cap.vht_supported))) return -EINVAL; /* * Since cfg80211_disable_40mhz_24ghz is global, we can * modify the sband's ht data even if the driver uses a * global structure for that. */ if (cfg80211_disable_40mhz_24ghz && band == NL80211_BAND_2GHZ && sband->ht_cap.ht_supported) { sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40; } /* * Since we use a u32 for rate bitmaps in * ieee80211_get_response_rate, we cannot * have more than 32 legacy rates. */ if (WARN_ON(sband->n_bitrates > 32)) return -EINVAL; for (i = 0; i < sband->n_channels; i++) { sband->channels[i].orig_flags = sband->channels[i].flags; sband->channels[i].orig_mag = INT_MAX; sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; if (WARN_ON(sband->channels[i].freq_offset >= 1000)) return -EINVAL; } for (i = 0; i < sband->n_iftype_data; i++) { const struct ieee80211_sband_iftype_data *iftd; iftd = &sband->iftype_data[i]; if (WARN_ON(!iftd->types_mask)) return -EINVAL; if (WARN_ON(types & iftd->types_mask)) return -EINVAL; /* at least one piece of information must be present */ if (WARN_ON(!iftd->he_cap.has_he)) return -EINVAL; types |= iftd->types_mask; if (i == 0) have_he = iftd->he_cap.has_he; else have_he = have_he && iftd->he_cap.has_he; } if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) return -EINVAL; have_band = true; } if (!have_band) { WARN_ON(1); return -EINVAL; } for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { /* * Validate we have a policy (can be explicitly set to * VENDOR_CMD_RAW_DATA which is non-NULL) and also that * we have at least one of doit/dumpit. */ if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy)) return -EINVAL; if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit && !rdev->wiphy.vendor_commands[i].dumpit)) return -EINVAL; } #ifdef CONFIG_PM if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns && (!rdev->wiphy.wowlan->pattern_min_len || rdev->wiphy.wowlan->pattern_min_len > rdev->wiphy.wowlan->pattern_max_len))) return -EINVAL; #endif /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; rtnl_lock(); res = device_add(&rdev->wiphy.dev); if (res) { rtnl_unlock(); return res; } list_add_rcu(&rdev->list, &cfg80211_rdev_list); cfg80211_rdev_list_generation++; /* add to debugfs */ rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy), ieee80211_debugfs_dir); cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); /* set up regulatory info */ wiphy_regulatory_register(wiphy); if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { struct regulatory_request request; request.wiphy_idx = get_wiphy_idx(wiphy); request.initiator = NL80211_REGDOM_SET_BY_DRIVER; request.alpha2[0] = '9'; request.alpha2[1] = '9'; nl80211_send_reg_change_event(&request); } /* Check that nobody globally advertises any capabilities they do not * advertise on all possible interface types. */ if (wiphy->extended_capabilities_len && wiphy->num_iftype_ext_capab && wiphy->iftype_ext_capab) { u8 supported_on_all, j; const struct wiphy_iftype_ext_capab *capab; capab = wiphy->iftype_ext_capab; for (j = 0; j < wiphy->extended_capabilities_len; j++) { if (capab[0].extended_capabilities_len > j) supported_on_all = capab[0].extended_capabilities[j]; else supported_on_all = 0x00; for (i = 1; i < wiphy->num_iftype_ext_capab; i++) { if (j >= capab[i].extended_capabilities_len) { supported_on_all = 0x00; break; } supported_on_all &= capab[i].extended_capabilities[j]; } if (WARN_ON(wiphy->extended_capabilities[j] & ~supported_on_all)) break; } } rdev->wiphy.registered = true; rtnl_unlock(); res = rfkill_register(rdev->wiphy.rfkill); if (res) { rfkill_destroy(rdev->wiphy.rfkill); rdev->wiphy.rfkill = NULL; wiphy_unregister(&rdev->wiphy); return res; } return 0; } EXPORT_SYMBOL(wiphy_register); void wiphy_rfkill_start_polling(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!rdev->ops->rfkill_poll) return; rdev->rfkill_ops.poll = cfg80211_rfkill_poll; rfkill_resume_polling(wiphy->rfkill); } EXPORT_SYMBOL(wiphy_rfkill_start_polling); void wiphy_unregister(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); wait_event(rdev->dev_wait, ({ int __count; wiphy_lock(&rdev->wiphy); __count = rdev->opencount; wiphy_unlock(&rdev->wiphy); __count == 0; })); if (rdev->wiphy.rfkill) rfkill_unregister(rdev->wiphy.rfkill); rtnl_lock(); wiphy_lock(&rdev->wiphy); nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); rdev->wiphy.registered = false; WARN_ON(!list_empty(&rdev->wiphy.wdev_list)); /* * First remove the hardware from everywhere, this makes * it impossible to find from userspace. */ debugfs_remove_recursive(rdev->wiphy.debugfsdir); list_del_rcu(&rdev->list); synchronize_rcu(); /* * If this device got a regulatory hint tell core its * free to listen now to a new shiny device regulatory hint */ wiphy_regulatory_deregister(wiphy); cfg80211_rdev_list_generation++; device_del(&rdev->wiphy.dev); wiphy_unlock(&rdev->wiphy); rtnl_unlock(); flush_work(&rdev->scan_done_wk); cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); flush_work(&rdev->mgmt_registrations_update_wk); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) rdev_set_wakeup(rdev, false); #endif cfg80211_rdev_free_wowlan(rdev); cfg80211_rdev_free_coalesce(rdev); } EXPORT_SYMBOL(wiphy_unregister); void cfg80211_dev_free(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *scan, *tmp; struct cfg80211_beacon_registration *reg, *treg; rfkill_destroy(rdev->wiphy.rfkill); list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) { list_del(&reg->list); kfree(reg); } list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) cfg80211_put_bss(&rdev->wiphy, &scan->pub); mutex_destroy(&rdev->wiphy.mtx); /* * The 'regd' can only be non-NULL if we never finished * initializing the wiphy and thus never went through the * unregister path - e.g. in failure scenarios. Thus, it * cannot have been visible to anyone if non-NULL, so we * can just free it here. */ kfree(rcu_dereference_raw(rdev->wiphy.regd)); kfree(rdev); } void wiphy_free(struct wiphy *wiphy) { put_device(&wiphy->dev); } EXPORT_SYMBOL(wiphy_free); void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, enum rfkill_hard_block_reasons reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (rfkill_set_hw_state_reason(wiphy->rfkill, blocked, reason)) schedule_work(&rdev->rfkill_block); } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); void cfg80211_cqm_config_free(struct wireless_dev *wdev) { kfree(wdev->cqm_config); wdev->cqm_config = NULL; } static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); flush_work(&wdev->pmsr_free_wk); nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); wdev->registered = false; if (wdev->netdev) { sysfs_remove_link(&wdev->netdev->dev.kobj, "phy80211"); if (unregister_netdev) unregister_netdevice(wdev->netdev); } list_del_rcu(&wdev->list); synchronize_net(); rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; case NL80211_IFTYPE_NAN: cfg80211_stop_nan(rdev, wdev); break; default: break; } #ifdef CONFIG_CFG80211_WEXT kfree_sensitive(wdev->wext.keys); wdev->wext.keys = NULL; #endif /* only initialized if we have a netdev */ if (wdev->netdev) flush_work(&wdev->disconnect_wk); cfg80211_cqm_config_free(wdev); /* * Ensure that all events have been processed and * freed. */ cfg80211_process_wdev_events(wdev); if (WARN_ON(wdev->current_bss)) { cfg80211_unhold_bss(wdev->current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; } } void cfg80211_unregister_wdev(struct wireless_dev *wdev) { _cfg80211_unregister_wdev(wdev, true); } EXPORT_SYMBOL(cfg80211_unregister_wdev); static const struct device_type wiphy_type = { .name = "wlan", }; void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num) { lockdep_assert_held(&rdev->wiphy.mtx); rdev->num_running_ifaces += num; if (iftype == NL80211_IFTYPE_MONITOR) rdev->num_running_monitor_ifaces += num; } void __cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct net_device *dev = wdev->netdev; struct cfg80211_sched_scan_request *pos, *tmp; lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); cfg80211_pmsr_wdev_down(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: __cfg80211_leave_ibss(rdev, dev, true); break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, list) { if (dev == pos->dev) cfg80211_stop_sched_scan_req(rdev, pos, false); } #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.ie); wdev->wext.ie = NULL; wdev->wext.ie_len = 0; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: __cfg80211_leave_mesh(rdev, dev); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: __cfg80211_stop_ap(rdev, dev, true); break; case NL80211_IFTYPE_OCB: __cfg80211_leave_ocb(rdev, dev); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: /* cannot happen, has no netdev */ break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: /* invalid */ break; } } void cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { wdev_lock(wdev); __cfg80211_leave(rdev, wdev); wdev_unlock(wdev); } void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_event *ev; unsigned long flags; trace_cfg80211_stop_iface(wiphy, wdev); ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_STOPPED; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_stop_iface); void cfg80211_init_wdev(struct wireless_dev *wdev) { mutex_init(&wdev->mtx); INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); #ifdef CONFIG_CFG80211_WEXT wdev->wext.default_key = -1; wdev->wext.default_mgmt_key = -1; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) wdev->ps = true; else wdev->ps = false; /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) wdev->netdev->priv_flags |= IFF_DONT_BRIDGE; INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); } void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); /* * We get here also when the interface changes network namespaces, * as it's registered into the new one, but we don't want it to * change ID in that case. Checking if the ID is already assigned * works, because 0 isn't considered a valid ID and the memory is * 0-initialized. */ if (!wdev->identifier) wdev->identifier = ++rdev->wdev_id; list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; wdev->registered = true; if (wdev->netdev && sysfs_create_link(&wdev->netdev->dev.kobj, &rdev->wiphy.dev.kobj, "phy80211")) pr_err("failed to add phy80211 symlink to netdev!\n"); nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } int cfg80211_register_netdevice(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; int ret; ASSERT_RTNL(); if (WARN_ON(!wdev)) return -EINVAL; rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_held(&rdev->wiphy.mtx); /* we'll take care of this */ wdev->registered = true; wdev->registering = true; ret = register_netdevice(dev); if (ret) goto out; cfg80211_register_wdev(rdev, wdev); ret = 0; out: wdev->registering = false; if (ret) wdev->registered = false; return ret; } EXPORT_SYMBOL(cfg80211_register_netdevice); static int cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *pos, *tmp; if (!wdev) return NOTIFY_DONE; rdev = wiphy_to_rdev(wdev->wiphy); WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED); switch (state) { case NETDEV_POST_INIT: SET_NETDEV_DEVTYPE(dev, &wiphy_type); wdev->netdev = dev; /* can only change netns with wiphy */ dev->features |= NETIF_F_NETNS_LOCAL; cfg80211_init_wdev(wdev); break; case NETDEV_REGISTER: if (!wdev->registered) { wiphy_lock(&rdev->wiphy); cfg80211_register_wdev(rdev, wdev); wiphy_unlock(&rdev->wiphy); } break; case NETDEV_UNREGISTER: /* * It is possible to get NETDEV_UNREGISTER multiple times, * so check wdev->registered. */ if (wdev->registered && !wdev->registering) { wiphy_lock(&rdev->wiphy); _cfg80211_unregister_wdev(wdev, false); wiphy_unlock(&rdev->wiphy); } break; case NETDEV_GOING_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); wiphy_unlock(&rdev->wiphy); break; case NETDEV_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, list) { if (WARN_ON(pos->dev == wdev->netdev)) cfg80211_stop_sched_scan_req(rdev, pos, false); } rdev->opencount--; wiphy_unlock(&rdev->wiphy); wake_up(&rdev->dev_wait); break; case NETDEV_UP: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, 1); wdev_lock(wdev); switch (wdev->iftype) { #ifdef CONFIG_CFG80211_WEXT case NL80211_IFTYPE_ADHOC: cfg80211_ibss_wext_join(rdev, wdev); break; case NL80211_IFTYPE_STATION: cfg80211_mgd_wext_connect(rdev, wdev); break; #endif #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: { /* backward compat code... */ struct mesh_setup setup; memcpy(&setup, &default_mesh_setup, sizeof(setup)); /* back compat only needed for mesh_id */ setup.mesh_id = wdev->ssid; setup.mesh_id_len = wdev->mesh_id_up_len; if (wdev->mesh_id_up_len) __cfg80211_join_mesh(rdev, dev, &setup, &default_mesh_config); break; } #endif default: break; } wdev_unlock(wdev); rdev->opencount++; /* * Configure power management to the driver here so that its * correctly set also after interface type changes etc. */ if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && rdev->ops->set_power_mgmt && rdev_set_power_mgmt(rdev, dev, wdev->ps, wdev->ps_timeout)) { /* assume this means it's off */ wdev->ps = false; } wiphy_unlock(&rdev->wiphy); break; case NETDEV_PRE_UP: if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype, wdev->use_4addr, 0)) return notifier_from_errno(-EOPNOTSUPP); if (rfkill_blocked(rdev->wiphy.rfkill)) return notifier_from_errno(-ERFKILL); break; default: return NOTIFY_DONE; } wireless_nlevent_flush(); return NOTIFY_OK; } static struct notifier_block cfg80211_netdev_notifier = { .notifier_call = cfg80211_netdev_notifier_call, }; static void __net_exit cfg80211_pernet_exit(struct net *net) { struct cfg80211_registered_device *rdev; rtnl_lock(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (net_eq(wiphy_net(&rdev->wiphy), net)) WARN_ON(cfg80211_switch_netns(rdev, &init_net)); } rtnl_unlock(); } static struct pernet_operations cfg80211_pernet_ops = { .exit = cfg80211_pernet_exit, }; static int __init cfg80211_init(void) { int err; err = register_pernet_device(&cfg80211_pernet_ops); if (err) goto out_fail_pernet; err = wiphy_sysfs_init(); if (err) goto out_fail_sysfs; err = register_netdevice_notifier(&cfg80211_netdev_notifier); if (err) goto out_fail_notifier; err = nl80211_init(); if (err) goto out_fail_nl80211; ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL); err = regulatory_init(); if (err) goto out_fail_reg; cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM); if (!cfg80211_wq) { err = -ENOMEM; goto out_fail_wq; } return 0; out_fail_wq: regulatory_exit(); out_fail_reg: debugfs_remove(ieee80211_debugfs_dir); nl80211_exit(); out_fail_nl80211: unregister_netdevice_notifier(&cfg80211_netdev_notifier); out_fail_notifier: wiphy_sysfs_exit(); out_fail_sysfs: unregister_pernet_device(&cfg80211_pernet_ops); out_fail_pernet: return err; } fs_initcall(cfg80211_init); static void __exit cfg80211_exit(void) { debugfs_remove(ieee80211_debugfs_dir); nl80211_exit(); unregister_netdevice_notifier(&cfg80211_netdev_notifier); wiphy_sysfs_exit(); regulatory_exit(); unregister_pernet_device(&cfg80211_pernet_ops); destroy_workqueue(cfg80211_wq); } module_exit(cfg80211_exit);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct wol_req_info { struct ethnl_req_info base; }; struct wol_reply_data { struct ethnl_reply_data base; struct ethtool_wolinfo wol; bool show_sopass; }; #define WOL_REPDATA(__reply_base) \ container_of(__reply_base, struct wol_reply_data, base) const struct nla_policy ethnl_wol_get_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int wol_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) { struct wol_reply_data *data = WOL_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_wol(dev, &data->wol); ethnl_ops_complete(dev); /* do not include password in notifications */ data->show_sopass = info && (data->wol.supported & WAKE_MAGICSECURE); return 0; } static int wol_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int len; len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (len < 0) return len; if (data->show_sopass) len += nla_total_size(sizeof(data->wol.sopass)); return len; } static int wol_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (ret < 0) return ret; if (data->show_sopass && nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass), data->wol.sopass)) return -EMSGSIZE; return 0; } const struct ethnl_request_ops ethnl_wol_request_ops = { .request_cmd = ETHTOOL_MSG_WOL_GET, .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, .hdr_attr = ETHTOOL_A_WOL_HEADER, .req_info_size = sizeof(struct wol_req_info), .reply_data_size = sizeof(struct wol_reply_data), .prepare_data = wol_prepare_data, .reply_size = wol_reply_size, .fill_reply = wol_fill_reply, }; /* WOL_SET */ const struct nla_policy ethnl_wol_set_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED }, [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY, .len = SOPASS_MAX }, }; int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) { struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; bool mod = false; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_WOL_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; dev->ethtool_ops->get_wol(dev, &wol); ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, tb[ETHTOOL_A_WOL_MODES], wol_mode_names, info->extack, &mod); if (ret < 0) goto out_ops; if (wol.wolopts & ~wol.supported) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], "cannot enable unsupported WoL mode"); ret = -EINVAL; goto out_ops; } if (tb[ETHTOOL_A_WOL_SOPASS]) { if (!(wol.supported & WAKE_MAGICSECURE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_SOPASS], "magicsecure not supported, cannot set password"); ret = -EINVAL; goto out_ops; } ethnl_update_binary(wol.sopass, sizeof(wol.sopass), tb[ETHTOOL_A_WOL_SOPASS], &mod); } if (!mod) goto out_ops; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) goto out_ops; dev->wol_enabled = !!wol.wolopts; ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); out_dev: dev_put(dev); return ret; }
264 2 265 263 202 246 902 201 903 117 51 65 27 317 157 220 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_nat_masquerade.h> struct masq_dev_work { struct work_struct work; struct net *net; union nf_inet_addr addr; int ifindex; int (*iter)(struct nf_conn *i, void *data); }; #define MAX_MASQ_WORKER_COUNT 16 static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; rt = skb_rtable(skb); nh = rt_nexthop(rt, ip_hdr(skb)->daddr); newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; } nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; /* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); static void iterate_cleanup_work(struct work_struct *work) { struct masq_dev_work *w; w = container_of(work, struct masq_dev_work, work); nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); put_net(w->net); kfree(w); atomic_dec(&masq_worker_count); module_put(THIS_MODULE); } /* Iterate conntrack table in the background and remove conntrack entries * that use the device/address being removed. * * In case too many work items have been queued already or memory allocation * fails iteration is skipped, conntrack entries will time out eventually. */ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, int ifindex, int (*iter)(struct nf_conn *i, void *data), gfp_t gfp_flags) { struct masq_dev_work *w; if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) return; net = maybe_get_net(net); if (!net) return; if (!try_module_get(THIS_MODULE)) goto err_module; w = kzalloc(sizeof(*w), gfp_flags); if (w) { /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ atomic_inc(&masq_worker_count); INIT_WORK(&w->work, iterate_cleanup_work); w->ifindex = ifindex; w->net = net; w->iter = iter; if (addr) w->addr = *addr; schedule_work(&w->work); return; } module_put(THIS_MODULE); err_module: put_net(net); } static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); const struct masq_dev_work *w = arg; if (!nat) return 0; return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for * conntracks which were associated with that device, * and forget them. */ nf_nat_masq_schedule(net, NULL, dev->ifindex, device_cmp, GFP_KERNEL); } return NOTIFY_DONE; } static int inet_cmp(struct nf_conn *ct, void *ptr) { struct nf_conntrack_tuple *tuple; struct masq_dev_work *w = ptr; if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct in_ifaddr *ifa = ptr; const struct in_device *idev; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; memset(&addr, 0, sizeof(addr)); addr.ip = ifa->ifa_address; dev = idev->dev; nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr) { #ifdef CONFIG_IPV6_MODULE const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (!v6_ops) return -EHOSTUNREACH; return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr); #else return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr); #endif } unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) return NF_DROP; nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.in6 = src; newrange.max_addr.in6 = src; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. * * As we can have 'a lot' of inet_events (depending on amount of ipv6 * addresses being deleted), we also need to limit work item queue. */ static int masq_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; memset(&addr, 0, sizeof(addr)); addr.in6 = ifa->addr; nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, GFP_ATOMIC); return NOTIFY_DONE; } static struct notifier_block masq_inet6_notifier = { .notifier_call = masq_inet6_event, }; static int nf_nat_masquerade_ipv6_register_notifier(void) { return register_inet6addr_notifier(&masq_inet6_notifier); } #else static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; } #endif int nf_nat_masquerade_inet_register_notifiers(void) { int ret = 0; mutex_lock(&masq_mutex); if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) { ret = -EOVERFLOW; goto out_unlock; } /* check if the notifier was already set */ if (++masq_refcnt > 1) goto out_unlock; /* Register for device down reports */ ret = register_netdevice_notifier(&masq_dev_notifier); if (ret) goto err_dec; /* Register IP address change reports */ ret = register_inetaddr_notifier(&masq_inet_notifier); if (ret) goto err_unregister; ret = nf_nat_masquerade_ipv6_register_notifier(); if (ret) goto err_unreg_inet; mutex_unlock(&masq_mutex); return ret; err_unreg_inet: unregister_inetaddr_notifier(&masq_inet_notifier); err_unregister: unregister_netdevice_notifier(&masq_dev_notifier); err_dec: masq_refcnt--; out_unlock: mutex_unlock(&masq_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers); void nf_nat_masquerade_inet_unregister_notifiers(void) { mutex_lock(&masq_mutex); /* check if the notifiers still have clients */ if (--masq_refcnt > 0) goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&masq_inet6_notifier); #endif out_unlock: mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);
9 2 7 47 47 47 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 // SPDX-License-Identifier: GPL-2.0-only /* * x_tables core - Backend for {ip,ip6,arp}_tables * * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on existing ip_tables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/mutex.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_arp/arp_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 #define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) struct xt_template { struct list_head list; /* called when table is needed in the given netns */ int (*table_init)(struct net *net); struct module *me; /* A unique name... */ char name[XT_TABLE_MAXNAMELEN]; }; static struct list_head xt_templates[NFPROTO_NUMPROTO]; struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; }; struct compat_delta { unsigned int offset; /* offset in kernel */ int delta; /* delta in 32bit user land */ }; struct xt_af { struct mutex mutex; struct list_head match; struct list_head target; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct mutex compat_mutex; struct compat_delta *compat_tab; unsigned int number; /* number of slots in compat_tab[] */ unsigned int cur; /* number of used slots in compat_tab[] */ #endif }; static unsigned int xt_pernet_id __read_mostly; static struct xt_af *xt __read_mostly; static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_UNSPEC] = "x", [NFPROTO_IPV4] = "ip", [NFPROTO_ARP] = "arp", [NFPROTO_BRIDGE] = "eb", [NFPROTO_IPV6] = "ip6", }; /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) { u_int8_t af = target->family; mutex_lock(&xt[af].mutex); list_add(&target->list, &xt[af].target); mutex_unlock(&xt[af].mutex); return 0; } EXPORT_SYMBOL(xt_register_target); void xt_unregister_target(struct xt_target *target) { u_int8_t af = target->family; mutex_lock(&xt[af].mutex); list_del(&target->list); mutex_unlock(&xt[af].mutex); } EXPORT_SYMBOL(xt_unregister_target); int xt_register_targets(struct xt_target *target, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = xt_register_target(&target[i]); if (err) goto err; } return err; err: if (i > 0) xt_unregister_targets(target, i); return err; } EXPORT_SYMBOL(xt_register_targets); void xt_unregister_targets(struct xt_target *target, unsigned int n) { while (n-- > 0) xt_unregister_target(&target[n]); } EXPORT_SYMBOL(xt_unregister_targets); int xt_register_match(struct xt_match *match) { u_int8_t af = match->family; mutex_lock(&xt[af].mutex); list_add(&match->list, &xt[af].match); mutex_unlock(&xt[af].mutex); return 0; } EXPORT_SYMBOL(xt_register_match); void xt_unregister_match(struct xt_match *match) { u_int8_t af = match->family; mutex_lock(&xt[af].mutex); list_del(&match->list); mutex_unlock(&xt[af].mutex); } EXPORT_SYMBOL(xt_unregister_match); int xt_register_matches(struct xt_match *match, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = xt_register_match(&match[i]); if (err) goto err; } return err; err: if (i > 0) xt_unregister_matches(match, i); return err; } EXPORT_SYMBOL(xt_register_matches); void xt_unregister_matches(struct xt_match *match, unsigned int n) { while (n-- > 0) xt_unregister_match(&match[n]); } EXPORT_SYMBOL(xt_unregister_matches); /* * These are weird, but module loading must not be done with mutex * held (since they will register), and we have to have a single * function to use. */ /* Find match, grabs ref. Returns ERR_PTR() on error. */ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) { struct xt_match *m; int err = -ENOENT; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision == revision) { if (try_module_get(m->me)) { mutex_unlock(&xt[af].mutex); return m; } } else err = -EPROTOTYPE; /* Found something. */ } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC) /* Try searching again in the family-independent list */ return xt_find_match(NFPROTO_UNSPEC, name, revision); return ERR_PTR(err); } EXPORT_SYMBOL(xt_find_match); struct xt_match * xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) { struct xt_match *match; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); match = xt_find_match(nfproto, name, revision); if (IS_ERR(match)) { request_module("%st_%s", xt_prefix[nfproto], name); match = xt_find_match(nfproto, name, revision); } return match; } EXPORT_SYMBOL_GPL(xt_request_find_match); /* Find target, grabs ref. Returns ERR_PTR() on error. */ static struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; int err = -ENOENT; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision == revision) { if (try_module_get(t->me)) { mutex_unlock(&xt[af].mutex); return t; } } else err = -EPROTOTYPE; /* Found something. */ } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC) /* Try searching again in the family-independent list */ return xt_find_target(NFPROTO_UNSPEC, name, revision); return ERR_PTR(err); } struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { struct xt_target *target; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); target = xt_find_target(af, name, revision); if (IS_ERR(target)) { request_module("%st_%s", xt_prefix[af], name); target = xt_find_target(af, name, revision); } return target; } EXPORT_SYMBOL_GPL(xt_request_find_target); static int xt_obj_to_user(u16 __user *psize, u16 size, void __user *pname, const char *name, u8 __user *prev, u8 rev) { if (put_user(size, psize)) return -EFAULT; if (copy_to_user(pname, name, strlen(name) + 1)) return -EFAULT; if (put_user(rev, prev)) return -EFAULT; return 0; } #define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE) \ xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size, \ U->u.user.name, K->u.kernel.TYPE->name, \ &U->u.user.revision, K->u.kernel.TYPE->revision) int xt_data_to_user(void __user *dst, const void *src, int usersize, int size, int aligned_size) { usersize = usersize ? : size; if (copy_to_user(dst, src, usersize)) return -EFAULT; if (usersize != aligned_size && clear_user(dst + usersize, aligned_size - usersize)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(xt_data_to_user); #define XT_DATA_TO_USER(U, K, TYPE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ K->u.kernel.TYPE->TYPE##size, \ XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) { return XT_OBJ_TO_USER(u, m, match, 0) || XT_DATA_TO_USER(u, m, match); } EXPORT_SYMBOL_GPL(xt_match_to_user); int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u) { return XT_OBJ_TO_USER(u, t, target, 0) || XT_DATA_TO_USER(u, t, target); } EXPORT_SYMBOL_GPL(xt_target_to_user); static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_match *m; int have_rev = 0; mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision > *bestp) *bestp = m->revision; if (m->revision == revision) have_rev = 1; } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); return have_rev; } static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_target *t; int have_rev = 0; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision > *bestp) *bestp = t->revision; if (t->revision == revision) have_rev = 1; } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); return have_rev; } /* Returns true or false (if no such extension at all) */ int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err) { int have_rev, best = -1; if (target == 1) have_rev = target_revfn(af, name, revision, &best); else have_rev = match_revfn(af, name, revision, &best); /* Nothing at all? Return 0 to try loading module. */ if (best == -1) { *err = -ENOENT; return 0; } *err = best; if (!have_rev) *err = -EPROTONOSUPPORT; return 1; } EXPORT_SYMBOL_GPL(xt_find_revision); static char * textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto) { static const char *const inetbr_names[] = { "PREROUTING", "INPUT", "FORWARD", "OUTPUT", "POSTROUTING", "BROUTING", }; static const char *const arp_names[] = { "INPUT", "FORWARD", "OUTPUT", }; const char *const *names; unsigned int i, max; char *p = buf; bool np = false; int res; names = (nfproto == NFPROTO_ARP) ? arp_names : inetbr_names; max = (nfproto == NFPROTO_ARP) ? ARRAY_SIZE(arp_names) : ARRAY_SIZE(inetbr_names); *p = '\0'; for (i = 0; i < max; ++i) { if (!(mask & (1 << i))) continue; res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]); if (res > 0) { size -= res; p += res; } np = true; } return buf; } /** * xt_check_proc_name - check that name is suitable for /proc file creation * * @name: file name candidate * @size: length of buffer * * some x_tables modules wish to create a file in /proc. * This function makes sure that the name is suitable for this * purpose, it checks that name is NUL terminated and isn't a 'special' * name, like "..". * * returns negative number on error or 0 if name is useable. */ int xt_check_proc_name(const char *name, unsigned int size) { if (name[0] == '\0') return -EINVAL; if (strnlen(name, size) == size) return -ENAMETOOLONG; if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0 || strchr(name, '/')) return -EINVAL; return 0; } EXPORT_SYMBOL(xt_check_proc_name); int xt_check_match(struct xt_mtchk_param *par, unsigned int size, u16 proto, bool inv_proto) { int ret; if (XT_ALIGN(par->match->matchsize) != size && par->match->matchsize != -1) { /* * ebt_among is exempt from centralized matchsize checking * because it uses a dynamic-size data set. */ pr_err_ratelimited("%s_tables: %s.%u match: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->match->name, par->match->revision, XT_ALIGN(par->match->matchsize), size); return -EINVAL; } if (par->match->table != NULL && strcmp(par->match->table, par->table) != 0) { pr_info_ratelimited("%s_tables: %s match: only valid in %s table, not %s\n", xt_prefix[par->family], par->match->name, par->match->table, par->table); return -EINVAL; } if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { char used[64], allow[64]; pr_info_ratelimited("%s_tables: %s match: used from hooks %s, but only valid from %s\n", xt_prefix[par->family], par->match->name, textify_hooks(used, sizeof(used), par->hook_mask, par->family), textify_hooks(allow, sizeof(allow), par->match->hooks, par->family)); return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { pr_info_ratelimited("%s_tables: %s match: only valid for protocol %u\n", xt_prefix[par->family], par->match->name, par->match->proto); return -EINVAL; } if (par->match->checkentry != NULL) { ret = par->match->checkentry(par); if (ret < 0) return ret; else if (ret > 0) /* Flag up potential errors. */ return -EIO; } return 0; } EXPORT_SYMBOL_GPL(xt_check_match); /** xt_check_entry_match - check that matches end before start of target * * @match: beginning of xt_entry_match * @target: beginning of this rules target (alleged end of matches) * @alignment: alignment requirement of match structures * * Validates that all matches add up to the beginning of the target, * and that each match covers at least the base structure size. * * Return: 0 on success, negative errno on failure. */ static int xt_check_entry_match(const char *match, const char *target, const size_t alignment) { const struct xt_entry_match *pos; int length = target - match; if (length == 0) /* no matches */ return 0; pos = (struct xt_entry_match *)match; do { if ((unsigned long)pos % alignment) return -EINVAL; if (length < (int)sizeof(struct xt_entry_match)) return -EINVAL; if (pos->u.match_size < sizeof(struct xt_entry_match)) return -EINVAL; if (pos->u.match_size > length) return -EINVAL; length -= pos->u.match_size; pos = ((void *)((char *)(pos) + (pos)->u.match_size)); } while (length > 0); return 0; } /** xt_check_table_hooks - check hook entry points are sane * * @info xt_table_info to check * @valid_hooks - hook entry points that we can enter from * * Validates that the hook entry and underflows points are set up. * * Return: 0 on success, negative errno on failure. */ int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks) { const char *err = "unsorted underflow"; unsigned int i, max_uflow, max_entry; bool check_hooks = false; BUILD_BUG_ON(ARRAY_SIZE(info->hook_entry) != ARRAY_SIZE(info->underflow)); max_entry = 0; max_uflow = 0; for (i = 0; i < ARRAY_SIZE(info->hook_entry); i++) { if (!(valid_hooks & (1 << i))) continue; if (info->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; if (info->underflow[i] == 0xFFFFFFFF) return -EINVAL; if (check_hooks) { if (max_uflow > info->underflow[i]) goto error; if (max_uflow == info->underflow[i]) { err = "duplicate underflow"; goto error; } if (max_entry > info->hook_entry[i]) { err = "unsorted entry"; goto error; } if (max_entry == info->hook_entry[i]) { err = "duplicate entry"; goto error; } } max_entry = info->hook_entry[i]; max_uflow = info->underflow[i]; check_hooks = true; } return 0; error: pr_err_ratelimited("%s at hook %d\n", err, i); return -EINVAL; } EXPORT_SYMBOL(xt_check_table_hooks); static bool verdict_ok(int verdict) { if (verdict > 0) return true; if (verdict < 0) { int v = -verdict - 1; if (verdict == XT_RETURN) return true; switch (v) { case NF_ACCEPT: return true; case NF_DROP: return true; case NF_QUEUE: return true; default: break; } return false; } return false; } static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, const char *msg, unsigned int msglen) { return usersize == kernsize && strnlen(msg, msglen) < msglen; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); if (WARN_ON(!xp->compat_tab)) return -ENOMEM; if (xp->cur >= xp->number) return -EINVAL; if (xp->cur) delta += xp->compat_tab[xp->cur - 1].delta; xp->compat_tab[xp->cur].offset = offset; xp->compat_tab[xp->cur].delta = delta; xp->cur++; return 0; } EXPORT_SYMBOL_GPL(xt_compat_add_offset); void xt_compat_flush_offsets(u_int8_t af) { WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); if (xt[af].compat_tab) { vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; xt[af].number = 0; xt[af].cur = 0; } } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); int xt_compat_calc_jump(u_int8_t af, unsigned int offset) { struct compat_delta *tmp = xt[af].compat_tab; int mid, left = 0, right = xt[af].cur - 1; while (left <= right) { mid = (left + right) >> 1; if (offset > tmp[mid].offset) left = mid + 1; else if (offset < tmp[mid].offset) right = mid - 1; else return mid ? tmp[mid - 1].delta : 0; } return left ? tmp[left - 1].delta : 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); int xt_compat_init_offsets(u8 af, unsigned int number) { size_t mem; WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); if (!number || number > (INT_MAX / sizeof(struct compat_delta))) return -EINVAL; if (WARN_ON(xt[af].compat_tab)) return -EINVAL; mem = sizeof(struct compat_delta) * number; if (mem > XT_MAX_TABLE_SIZE) return -ENOMEM; xt[af].compat_tab = vmalloc(mem); if (!xt[af].compat_tab) return -ENOMEM; xt[af].number = number; xt[af].cur = 0; return 0; } EXPORT_SYMBOL(xt_compat_init_offsets); int xt_compat_match_offset(const struct xt_match *match) { u_int16_t csize = match->compatsize ? : match->matchsize; return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize); } EXPORT_SYMBOL_GPL(xt_compat_match_offset); void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, unsigned int *size) { const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; int off = xt_compat_match_offset(match); u_int16_t msize = cm->u.user.match_size; char name[sizeof(m->u.user.name)]; m = *dstptr; memcpy(m, cm, sizeof(*cm)); if (match->compat_from_user) match->compat_from_user(m->data, cm->data); else memcpy(m->data, cm->data, msize - sizeof(*cm)); msize += off; m->u.user.match_size = msize; strlcpy(name, match->name, sizeof(name)); module_put(match->me); strncpy(m->u.user.name, name, sizeof(m->u.user.name)); *size += off; *dstptr += msize; } EXPORT_SYMBOL_GPL(xt_compat_match_from_user); #define COMPAT_XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ C_SIZE, \ COMPAT_XT_ALIGN(C_SIZE)) int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size) { const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match __user *cm = *dstptr; int off = xt_compat_match_offset(match); u_int16_t msize = m->u.user.match_size - off; if (XT_OBJ_TO_USER(cm, m, match, msize)) return -EFAULT; if (match->compat_to_user) { if (match->compat_to_user((void __user *)cm->data, m->data)) return -EFAULT; } else { if (COMPAT_XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) return -EFAULT; } *size -= off; *dstptr += msize; return 0; } EXPORT_SYMBOL_GPL(xt_compat_match_to_user); /* non-compat version may have padding after verdict */ struct compat_xt_standard_target { struct compat_xt_entry_target t; compat_uint_t verdict; }; struct compat_xt_error_target { struct compat_xt_entry_target t; char errorname[XT_FUNCTION_MAXNAMELEN]; }; int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) { long size_of_base_struct = elems - (const char *)base; const struct compat_xt_entry_target *t; const char *e = base; if (target_offset < size_of_base_struct) return -EINVAL; if (target_offset + sizeof(*t) > next_offset) return -EINVAL; t = (void *)(e + target_offset); if (t->u.target_size < sizeof(*t)) return -EINVAL; if (target_offset + t->u.target_size > next_offset) return -EINVAL; if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { const struct compat_xt_standard_target *st = (const void *)t; if (COMPAT_XT_ALIGN(target_offset + sizeof(*st)) != next_offset) return -EINVAL; if (!verdict_ok(st->verdict)) return -EINVAL; } else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { const struct compat_xt_error_target *et = (const void *)t; if (!error_tg_ok(t->u.target_size, sizeof(*et), et->errorname, sizeof(et->errorname))) return -EINVAL; } /* compat_xt_entry match has less strict alignment requirements, * otherwise they are identical. In case of padding differences * we need to add compat version of xt_check_entry_match. */ BUILD_BUG_ON(sizeof(struct compat_xt_entry_match) != sizeof(struct xt_entry_match)); return xt_check_entry_match(elems, base + target_offset, __alignof__(struct compat_xt_entry_match)); } EXPORT_SYMBOL(xt_compat_check_entry_offsets); #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ /** * xt_check_entry_offsets - validate arp/ip/ip6t_entry * * @base: pointer to arp/ip/ip6t_entry * @elems: pointer to first xt_entry_match, i.e. ip(6)t_entry->elems * @target_offset: the arp/ip/ip6_t->target_offset * @next_offset: the arp/ip/ip6_t->next_offset * * validates that target_offset and next_offset are sane and that all * match sizes (if any) align with the target offset. * * This function does not validate the targets or matches themselves, it * only tests that all the offsets and sizes are correct, that all * match structures are aligned, and that the last structure ends where * the target structure begins. * * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version. * * The arp/ip/ip6t_entry structure @base must have passed following tests: * - it must point to a valid memory location * - base to base + next_offset must be accessible, i.e. not exceed allocated * length. * * A well-formed entry looks like this: * * ip(6)t_entry match [mtdata] match [mtdata] target [tgdata] ip(6)t_entry * e->elems[]-----' | | * matchsize | | * matchsize | | * | | * target_offset---------------------------------' | * next_offset---------------------------------------------------' * * elems[]: flexible array member at end of ip(6)/arpt_entry struct. * This is where matches (if any) and the target reside. * target_offset: beginning of target. * next_offset: start of the next rule; also: size of this rule. * Since targets have a minimum size, target_offset + minlen <= next_offset. * * Every match stores its size, sum of sizes must not exceed target_offset. * * Return: 0 on success, negative errno on failure. */ int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) { long size_of_base_struct = elems - (const char *)base; const struct xt_entry_target *t; const char *e = base; /* target start is within the ip/ip6/arpt_entry struct */ if (target_offset < size_of_base_struct) return -EINVAL; if (target_offset + sizeof(*t) > next_offset) return -EINVAL; t = (void *)(e + target_offset); if (t->u.target_size < sizeof(*t)) return -EINVAL; if (target_offset + t->u.target_size > next_offset) return -EINVAL; if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { const struct xt_standard_target *st = (const void *)t; if (XT_ALIGN(target_offset + sizeof(*st)) != next_offset) return -EINVAL; if (!verdict_ok(st->verdict)) return -EINVAL; } else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { const struct xt_error_target *et = (const void *)t; if (!error_tg_ok(t->u.target_size, sizeof(*et), et->errorname, sizeof(et->errorname))) return -EINVAL; } return xt_check_entry_match(elems, base + target_offset, __alignof__(struct xt_entry_match)); } EXPORT_SYMBOL(xt_check_entry_offsets); /** * xt_alloc_entry_offsets - allocate array to store rule head offsets * * @size: number of entries * * Return: NULL or zeroed kmalloc'd or vmalloc'd array */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int)) return NULL; return kvcalloc(size, sizeof(unsigned int), GFP_KERNEL); } EXPORT_SYMBOL(xt_alloc_entry_offsets); /** * xt_find_jump_offset - check if target is a valid jump offset * * @offsets: array containing all valid rule start offsets of a rule blob * @target: the jump target to search for * @size: entries in @offset */ bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size) { int m, low = 0, hi = size; while (hi > low) { m = (low + hi) / 2u; if (offsets[m] > target) hi = m; else if (offsets[m] < target) low = m + 1; else return true; } return false; } EXPORT_SYMBOL(xt_find_jump_offset); int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u16 proto, bool inv_proto) { int ret; if (XT_ALIGN(par->target->targetsize) != size) { pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->target->name, par->target->revision, XT_ALIGN(par->target->targetsize), size); return -EINVAL; } if (par->target->table != NULL && strcmp(par->target->table, par->table) != 0) { pr_info_ratelimited("%s_tables: %s target: only valid in %s table, not %s\n", xt_prefix[par->family], par->target->name, par->target->table, par->table); return -EINVAL; } if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { char used[64], allow[64]; pr_info_ratelimited("%s_tables: %s target: used from hooks %s, but only usable from %s\n", xt_prefix[par->family], par->target->name, textify_hooks(used, sizeof(used), par->hook_mask, par->family), textify_hooks(allow, sizeof(allow), par->target->hooks, par->family)); return -EINVAL; } if (par->target->proto && (par->target->proto != proto || inv_proto)) { pr_info_ratelimited("%s_tables: %s target: only valid for protocol %u\n", xt_prefix[par->family], par->target->name, par->target->proto); return -EINVAL; } if (par->target->checkentry != NULL) { ret = par->target->checkentry(par); if (ret < 0) return ret; else if (ret > 0) /* Flag up potential errors. */ return -EIO; } return 0; } EXPORT_SYMBOL_GPL(xt_check_target); /** * xt_copy_counters - copy counters and metadata from a sockptr_t * * @arg: src sockptr * @len: alleged size of userspace memory * @info: where to store the xt_counters_info metadata * * Copies counter meta data from @user and stores it in @info. * * vmallocs memory to hold the counters, then copies the counter data * from @user to the new memory and returns a pointer to it. * * If called from a compat syscall, @info gets converted automatically to the * 64bit representation. * * The metadata associated with the counters is stored in @info. * * Return: returns pointer that caller has to test via IS_ERR(). * If IS_ERR is false, caller has to vfree the pointer. */ void *xt_copy_counters(sockptr_t arg, unsigned int len, struct xt_counters_info *info) { size_t offset; void *mem; u64 size; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) { /* structures only differ in size due to alignment */ struct compat_xt_counters_info compat_tmp; if (len <= sizeof(compat_tmp)) return ERR_PTR(-EINVAL); len -= sizeof(compat_tmp); if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; offset = sizeof(compat_tmp); } else #endif { if (len <= sizeof(*info)) return ERR_PTR(-EINVAL); len -= sizeof(*info); if (copy_from_sockptr(info, arg, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); offset = sizeof(*info); } info->name[sizeof(info->name) - 1] = '\0'; size = sizeof(struct xt_counters); size *= info->num_counters; if (size != (u64)len) return ERR_PTR(-EINVAL); mem = vmalloc(len); if (!mem) return ERR_PTR(-ENOMEM); if (copy_from_sockptr_offset(mem, arg, offset, len) == 0) return mem; vfree(mem); return ERR_PTR(-EFAULT); } EXPORT_SYMBOL_GPL(xt_copy_counters); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_target_offset(const struct xt_target *target) { u_int16_t csize = target->compatsize ? : target->targetsize; return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize); } EXPORT_SYMBOL_GPL(xt_compat_target_offset); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, unsigned int *size) { const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; int off = xt_compat_target_offset(target); u_int16_t tsize = ct->u.user.target_size; char name[sizeof(t->u.user.name)]; t = *dstptr; memcpy(t, ct, sizeof(*ct)); if (target->compat_from_user) target->compat_from_user(t->data, ct->data); else memcpy(t->data, ct->data, tsize - sizeof(*ct)); tsize += off; t->u.user.target_size = tsize; strlcpy(name, target->name, sizeof(name)); module_put(target->me); strncpy(t->u.user.name, name, sizeof(t->u.user.name)); *size += off; *dstptr += tsize; } EXPORT_SYMBOL_GPL(xt_compat_target_from_user); int xt_compat_target_to_user(const struct xt_entry_target *t, void __user **dstptr, unsigned int *size) { const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target __user *ct = *dstptr; int off = xt_compat_target_offset(target); u_int16_t tsize = t->u.user.target_size - off; if (XT_OBJ_TO_USER(ct, t, target, tsize)) return -EFAULT; if (target->compat_to_user) { if (target->compat_to_user((void __user *)ct->data, t->data)) return -EFAULT; } else { if (COMPAT_XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) return -EFAULT; } *size -= off; *dstptr += tsize; return 0; } EXPORT_SYMBOL_GPL(xt_compat_target_to_user); #endif struct xt_table_info *xt_alloc_table_info(unsigned int size) { struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE) return NULL; info = kvmalloc(sz, GFP_KERNEL_ACCOUNT); if (!info) return NULL; memset(info, 0, sizeof(*info)); info->size = size; return info; } EXPORT_SYMBOL(xt_alloc_table_info); void xt_free_table_info(struct xt_table_info *info) { int cpu; if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) kvfree(info->jumpstack[cpu]); kvfree(info->jumpstack); } kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); struct xt_table *xt_find_table(struct net *net, u8 af, const char *name) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct xt_table *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) { if (strcmp(t->name, name) == 0) { mutex_unlock(&xt[af].mutex); return t; } } mutex_unlock(&xt[af].mutex); return NULL; } EXPORT_SYMBOL(xt_find_table); /* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct module *owner = NULL; struct xt_template *tmpl; struct xt_table *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; /* Table doesn't exist in this netns, check larval list */ list_for_each_entry(tmpl, &xt_templates[af], list) { int err; if (strcmp(tmpl->name, name)) continue; if (!try_module_get(tmpl->me)) goto out; owner = tmpl->me; mutex_unlock(&xt[af].mutex); err = tmpl->table_init(net); if (err < 0) { module_put(owner); return ERR_PTR(err); } mutex_lock(&xt[af].mutex); break; } /* and once again: */ list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && owner == t->me) return t; module_put(owner); out: mutex_unlock(&xt[af].mutex); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(xt_find_table_lock); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, const char *name) { struct xt_table *t = xt_find_table_lock(net, af, name); #ifdef CONFIG_MODULES if (IS_ERR(t)) { int err = request_module("%stable_%s", xt_prefix[af], name); if (err < 0) return ERR_PTR(err); t = xt_find_table_lock(net, af, name); } #endif return t; } EXPORT_SYMBOL_GPL(xt_request_find_table_lock); void xt_table_unlock(struct xt_table *table) { mutex_unlock(&xt[table->af].mutex); } EXPORT_SYMBOL_GPL(xt_table_unlock); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_lock); void xt_compat_unlock(u_int8_t af) { mutex_unlock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif DEFINE_PER_CPU(seqcount_t, xt_recseq); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); struct static_key xt_tee_enabled __read_mostly; EXPORT_SYMBOL_GPL(xt_tee_enabled); static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; int cpu; size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) i->jumpstack = kvzalloc(size, GFP_KERNEL); else i->jumpstack = kzalloc(size, GFP_KERNEL); if (i->jumpstack == NULL) return -ENOMEM; /* ruleset without jumps -- no stack needed */ if (i->stacksize == 0) return 0; /* Jumpstack needs to be able to record two full callchains, one * from the first rule set traversal, plus one table reentrancy * via -j TEE without clobbering the callchain that brought us to * TEE target. * * This is done by allocating two jumpstacks per cpu, on reentry * the upper half of the stack is used. * * see the jumpstack setup in ipt_do_table() for more details. */ size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { i->jumpstack[cpu] = kvmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (i->jumpstack[cpu] == NULL) /* * Freeing will be done later on by the callers. The * chain is: xt_replace_table -> __do_replace -> * do_replace -> xt_free_table_info. */ return -ENOMEM; } return 0; } struct xt_counters *xt_counters_alloc(unsigned int counters) { struct xt_counters *mem; if (counters == 0 || counters > INT_MAX / sizeof(*mem)) return NULL; counters *= sizeof(*mem); if (counters > XT_MAX_TABLE_SIZE) return NULL; return vzalloc(counters); } EXPORT_SYMBOL(xt_counters_alloc); struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { struct xt_table_info *private; unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); if (ret < 0) { *error = ret; return NULL; } /* Do the substitution. */ local_bh_disable(); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); local_bh_enable(); *error = -EAGAIN; return NULL; } newinfo->initial_entries = private->initial_entries; /* * Ensure contents of newinfo are visible before assigning to * private. */ smp_wmb(); table->private = newinfo; /* make sure all cpus see new ->private value */ smp_mb(); /* * Even though table entries have now been swapped, other CPU's * may still be using the old entries... */ local_bh_enable(); /* ... so wait for even xt_recseq on all cpus */ for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); u32 seq = raw_read_seqcount(s); if (seq & 1) { do { cond_resched(); cpu_relax(); } while (seq == raw_read_seqcount(s)); } } audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : AUDIT_XT_OP_REPLACE, GFP_KERNEL); return private; } EXPORT_SYMBOL_GPL(xt_replace_table); struct xt_table *xt_register_table(struct net *net, const struct xt_table *input_table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct xt_table_info *private; struct xt_table *t, *table; int ret; /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); if (!table) { ret = -ENOMEM; goto out; } mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ list_for_each_entry(t, &xt_net->tables[table->af], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto unlock; } } /* Simplifies replace_table code. */ table->private = bootstrap; if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ private->initial_entries = private->number; list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); kfree(table); out: return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); void *xt_unregister_table(struct xt_table *table) { struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); private = table->private; list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); kfree(table->ops); kfree(table); return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; xt_net = net_generic(net, xt_pernet_id); mutex_lock(&xt[af].mutex); return seq_list_start(&xt_net->tables[af], *pos); } static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; xt_net = net_generic(net, xt_pernet_id); return seq_list_next(v, &xt_net->tables[af], pos); } static void xt_table_seq_stop(struct seq_file *seq, void *v) { u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); mutex_unlock(&xt[af].mutex); } static int xt_table_seq_show(struct seq_file *seq, void *v) { struct xt_table *table = list_entry(v, struct xt_table, list); if (*table->name) seq_printf(seq, "%s\n", table->name); return 0; } static const struct seq_operations xt_table_seq_ops = { .start = xt_table_seq_start, .next = xt_table_seq_next, .stop = xt_table_seq_stop, .show = xt_table_seq_show, }; /* * Traverse state for ip{,6}_{tables,matches} for helping crossing * the multi-AF mutexes. */ struct nf_mttg_trav { struct list_head *head, *curr; uint8_t class; }; enum { MTTG_TRAV_INIT, MTTG_TRAV_NFP_UNSPEC, MTTG_TRAV_NFP_SPEC, MTTG_TRAV_DONE, }; static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, bool is_target) { static const uint8_t next_class[] = { [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, }; uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; if (ppos != NULL) ++(*ppos); switch (trav->class) { case MTTG_TRAV_INIT: trav->class = MTTG_TRAV_NFP_UNSPEC; mutex_lock(&xt[NFPROTO_UNSPEC].mutex); trav->head = trav->curr = is_target ? &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match; break; case MTTG_TRAV_NFP_UNSPEC: trav->curr = trav->curr->next; if (trav->curr != trav->head) break; mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); mutex_lock(&xt[nfproto].mutex); trav->head = trav->curr = is_target ? &xt[nfproto].target : &xt[nfproto].match; trav->class = next_class[trav->class]; break; case MTTG_TRAV_NFP_SPEC: trav->curr = trav->curr->next; if (trav->curr != trav->head) break; fallthrough; default: return NULL; } return trav; } static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, bool is_target) { struct nf_mttg_trav *trav = seq->private; unsigned int j; trav->class = MTTG_TRAV_INIT; for (j = 0; j < *pos; ++j) if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL) return NULL; return trav; } static void xt_mttg_seq_stop(struct seq_file *seq, void *v) { uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; switch (trav->class) { case MTTG_TRAV_NFP_UNSPEC: mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); break; case MTTG_TRAV_NFP_SPEC: mutex_unlock(&xt[nfproto].mutex); break; } } static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) { return xt_mttg_seq_start(seq, pos, false); } static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { return xt_mttg_seq_next(seq, v, ppos, false); } static int xt_match_seq_show(struct seq_file *seq, void *v) { const struct nf_mttg_trav *trav = seq->private; const struct xt_match *match; switch (trav->class) { case MTTG_TRAV_NFP_UNSPEC: case MTTG_TRAV_NFP_SPEC: if (trav->curr == trav->head) return 0; match = list_entry(trav->curr, struct xt_match, list); if (*match->name) seq_printf(seq, "%s\n", match->name); } return 0; } static const struct seq_operations xt_match_seq_ops = { .start = xt_match_seq_start, .next = xt_match_seq_next, .stop = xt_mttg_seq_stop, .show = xt_match_seq_show, }; static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos) { return xt_mttg_seq_start(seq, pos, true); } static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { return xt_mttg_seq_next(seq, v, ppos, true); } static int xt_target_seq_show(struct seq_file *seq, void *v) { const struct nf_mttg_trav *trav = seq->private; const struct xt_target *target; switch (trav->class) { case MTTG_TRAV_NFP_UNSPEC: case MTTG_TRAV_NFP_SPEC: if (trav->curr == trav->head) return 0; target = list_entry(trav->curr, struct xt_target, list); if (*target->name) seq_printf(seq, "%s\n", target->name); } return 0; } static const struct seq_operations xt_target_seq_ops = { .start = xt_target_seq_start, .next = xt_target_seq_next, .stop = xt_mttg_seq_stop, .show = xt_target_seq_show, }; #define FORMAT_TABLES "_tables_names" #define FORMAT_MATCHES "_tables_matches" #define FORMAT_TARGETS "_tables_targets" #endif /* CONFIG_PROC_FS */ /** * xt_hook_ops_alloc - set up hooks for a new table * @table: table with metadata needed to set up hooks * @fn: Hook function * * This function will create the nf_hook_ops that the x_table needs * to hand to xt_hook_link_net(). */ struct nf_hook_ops * xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) { unsigned int hook_mask = table->valid_hooks; uint8_t i, num_hooks = hweight32(hook_mask); uint8_t hooknum; struct nf_hook_ops *ops; if (!num_hooks) return ERR_PTR(-EINVAL); ops = kcalloc(num_hooks, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0; hook_mask >>= 1, ++hooknum) { if (!(hook_mask & 1)) continue; ops[i].hook = fn; ops[i].pf = table->af; ops[i].hooknum = hooknum; ops[i].priority = table->priority; ++i; } return ops; } EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); int xt_register_template(const struct xt_table *table, int (*table_init)(struct net *net)) { int ret = -EEXIST, af = table->af; struct xt_template *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_templates[af], list) { if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0)) goto out_unlock; } ret = -ENOMEM; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out_unlock; BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name)); strscpy(t->name, table->name, sizeof(t->name)); t->table_init = table_init; t->me = table->me; list_add(&t->list, &xt_templates[af]); ret = 0; out_unlock: mutex_unlock(&xt[af].mutex); return ret; } EXPORT_SYMBOL_GPL(xt_register_template); void xt_unregister_template(const struct xt_table *table) { struct xt_template *t; int af = table->af; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_templates[af], list) { if (strcmp(table->name, t->name)) continue; list_del(&t->list); mutex_unlock(&xt[af].mutex); kfree(t); return; } mutex_unlock(&xt[af].mutex); WARN_ON_ONCE(1); } EXPORT_SYMBOL_GPL(xt_unregister_template); int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; #endif if (af >= ARRAY_SIZE(xt_prefix)) return -EINVAL; #ifdef CONFIG_PROC_FS root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops, sizeof(struct seq_net_private), (void *)(unsigned long)af); if (!proc) goto out; if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_match_seq_ops, sizeof(struct nf_mttg_trav), (void *)(unsigned long)af); if (!proc) goto out_remove_tables; if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_target_seq_ops, sizeof(struct nf_mttg_trav), (void *)(unsigned long)af); if (!proc) goto out_remove_matches; if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif return 0; #ifdef CONFIG_PROC_FS out_remove_matches: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out_remove_tables: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out: return -1; #endif } EXPORT_SYMBOL_GPL(xt_proto_init); void xt_proto_fini(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); #endif /*CONFIG_PROC_FS*/ } EXPORT_SYMBOL_GPL(xt_proto_fini); /** * xt_percpu_counter_alloc - allocate x_tables rule counter * * @state: pointer to xt_percpu allocation state * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct * * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then * contain the address of the real (percpu) counter. * * Rule evaluation needs to use xt_get_this_cpu_counter() helper * to fetch the real percpu counter. * * To speed up allocation and improve data locality, a 4kb block is * allocated. Freeing any counter may free an entire block, so all * counters allocated using the same state must be freed at the same * time. * * xt_percpu_counter_alloc_state contains the base address of the * allocated page and the current sub-offset. * * returns false on error. */ bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, struct xt_counters *counter) { BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2)); if (nr_cpu_ids <= 1) return true; if (!state->mem) { state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE, XT_PCPU_BLOCK_SIZE); if (!state->mem) return false; } counter->pcnt = (__force unsigned long)(state->mem + state->off); state->off += sizeof(*counter); if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) { state->mem = NULL; state->off = 0; } return true; } EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc); void xt_percpu_counter_free(struct xt_counters *counters) { unsigned long pcnt = counters->pcnt; if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0) free_percpu((void __percpu *)pcnt); } EXPORT_SYMBOL_GPL(xt_percpu_counter_free); static int __net_init xt_net_init(struct net *net) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) INIT_LIST_HEAD(&xt_net->tables[i]); return 0; } static void __net_exit xt_net_exit(struct net *net) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); } static struct pernet_operations xt_net_ops = { .init = xt_net_init, .exit = xt_net_exit, .id = &xt_pernet_id, .size = sizeof(struct xt_pernet), }; static int __init xt_init(void) { unsigned int i; int rv; for_each_possible_cpu(i) { seqcount_init(&per_cpu(xt_recseq, i)); } xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); if (!xt) return -ENOMEM; for (i = 0; i < NFPROTO_NUMPROTO; i++) { mutex_init(&xt[i].mutex); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT mutex_init(&xt[i].compat_mutex); xt[i].compat_tab = NULL; #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); INIT_LIST_HEAD(&xt_templates[i]); } rv = register_pernet_subsys(&xt_net_ops); if (rv < 0) kfree(xt); return rv; } static void __exit xt_fini(void) { unregister_pernet_subsys(&xt_net_ops); kfree(xt); } module_init(xt_init); module_exit(xt_fini);
4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/block_validity.c * * Copyright (C) 2009 * Theodore Ts'o (tytso@mit.edu) * * Track which blocks in the filesystem are metadata blocks that * should never be used as data blocks by files or directories. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/slab.h> #include "ext4.h" struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; int __init ext4_init_system_zone(void) { ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); if (ext4_system_zone_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_system_zone(void) { rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { if ((entry1->start_blk + entry1->count) == entry2->start_blk && entry1->ino == entry2->ino) return 1; return 0; } static void release_system_zone(struct ext4_system_blocks *system_blks) { struct ext4_system_zone *entry, *n; rbtree_postorder_for_each_entry_safe(entry, n, &system_blks->root, node) kmem_cache_free(ext4_system_zone_cachep, entry); } /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count, u32 ino) { struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_system_zone, node); if (start_blk < entry->start_blk) n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else /* Unexpected overlap of system zones. */ return -EFSCORRUPTED; } new_entry = kmem_cache_alloc(ext4_system_zone_cachep, GFP_KERNEL); if (!new_entry) return -ENOMEM; new_entry->start_blk = start_blk; new_entry->count = count; new_entry->ino = ino; new_node = &new_entry->node; rb_link_node(new_node, parent, n); rb_insert_color(new_node, &system_blks->root); /* Can we merge to the left? */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } /* Can we merge to the right? */ node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } return 0; } static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", entry->start_blk, entry->start_blk + entry->count - 1); first = 0; node = rb_next(node); } rcu_read_unlock(); printk(KERN_CONT "\n"); } static int ext4_protect_reserved_inode(struct super_block *sb, struct ext4_system_blocks *system_blks, u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_map_blocks map; u32 i = 0, num; int err = 0, n; if ((ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) return -EINVAL; inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) return PTR_ERR(inode); num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; while (i < num) { cond_resched(); map.m_lblk = i; map.m_len = num - i; n = ext4_map_blocks(NULL, inode, &map, 0); if (n < 0) { err = n; break; } if (n == 0) { i++; } else { err = add_system_zone(system_blks, map.m_pblk, n, ino); if (err < 0) { if (err == -EFSCORRUPTED) { EXT4_ERROR_INODE_ERR(inode, -err, "blocks %llu-%llu from inode overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1); } break; } i += n; } } iput(inode); return err; } static void ext4_destroy_system_zone(struct rcu_head *rcu) { struct ext4_system_blocks *system_blks; system_blks = container_of(rcu, struct ext4_system_blocks, rcu); release_system_zone(system_blks); kfree(system_blks); } /* * Build system zone rbtree which is used for block validity checking. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. That's why we first build the rbtree and then * swap it in place. */ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); if (!system_blks) return -ENOMEM; for (i=0; i < ngroups; i++) { unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); cond_resched(); if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), meta_blks, 0); if (ret) goto err; } gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), sbi->s_itb_per_group, 0); if (ret) goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) goto err; } /* * System blks rbtree complete, announce it once to prevent racing * with ext4_inode_block_valid() accessing the rbtree at the same * time. */ rcu_assign_pointer(sbi->s_system_blks, system_blks); if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; err: release_system_zone(system_blks); kfree(system_blks); return ret; } /* * Called when the filesystem is unmounted or when remounting it with * noblock_validity specified. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. So we first clear the system_blks pointer and * then free the rbtree only after RCU grace period expires. */ void ext4_release_system_zone(struct super_block *sb) { struct ext4_system_blocks *system_blks; system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, lockdep_is_held(&sb->s_umount)); rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); if (system_blks) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_system_zone *entry; struct rb_node *n; int ret = 1; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; /* * Lock the system zone to prevent it being released concurrently * when doing a remount which inverse current "[no]block_validity" * mount option. */ rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); if (system_blks == NULL) goto out_rcu; n = system_blks->root.rb_node; while (n) { entry = rb_entry(n, struct ext4_system_zone, node); if (start_blk + count - 1 < entry->start_blk) n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; else { ret = 0; if (inode) ret = (entry->ino == inode->i_ino); break; } } out_rcu: rcu_read_unlock(); return ret; } /* * Returns 1 if the passed-in block region (start_blk, * start_blk+count) is valid; 0 if some part of the block region * overlaps with some other filesystem metadata blocks. */ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count); } int ext4_check_blockref(const char *function, unsigned int line, struct inode *inode, __le32 *p, unsigned int max) { __le32 *bref = p; unsigned int blk; if (ext4_has_feature_journal(inode->i_sb) && (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && unlikely(!ext4_inode_block_valid(inode, blk, 1))) { ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; } } return 0; }
26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. * * Also note that we take the address to load from from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) goto out; /* * Check do_open_execat() for an explanation. */ error = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) goto exit; fsnotify_open(file); error = -ENOEXEC; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!fmt->load_shlib) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); error = fmt->load_shlib(file); read_lock(&binfmt_lock); put_binfmt(fmt); if (error != -ENOEXEC) break; } read_unlock(&binfmt_lock); exit: fput(file); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; int ret; unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { ret = expand_downwards(bprm->vma, pos); if (ret < 0) return NULL; } #endif if (write) gup_flags |= FOLL_WRITE; /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, &page, NULL, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(bprm->vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; vma_set_anonymous(vma); if (mmap_write_lock_killable(mm)) { err = -EINTR; goto err_free; } /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; err: mmap_write_unlock(mm); err_free: bprm->vma = NULL; vm_area_free(vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; bprm->argmin = bprm->p - limit; return 0; } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwords. */ pos = bprm->p; str += len; bprm->p -= len; #ifdef CONFIG_MMU if (bprm->p < bprm->argmin) goto out; #endif while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; char *kaddr; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); flush_dcache_page(page); kunmap_atomic(kaddr); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. The process proceeds as follows: * * 1) Use shift to calculate the new vma endpoints. * 2) Extend vma to cover both the old and new ranges. This ensures the * arguments passed to subsequent functions are consistent. * 3) Move vma's page tables to the new range. * 4) Free up any cleared pgd range. * 5) Shrink the vma to cover only the new range. */ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) { struct mm_struct *mm = vma->vm_mm; unsigned long old_start = vma->vm_start; unsigned long old_end = vma->vm_end; unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; struct mmu_gather tlb; BUG_ON(new_start > new_end); /* * ensure there are no vmas between where we want to go * and where we are */ if (vma != find_vma(mm, new_start)) return -EFAULT; /* * cover the whole range: [new_start, old_end) */ if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* * move the page tables downwards, on failure we rely on * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); tlb_gather_mmu(&tlb, mm); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch * the address space in [new_end, old_start) some architectures * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); /* * Shrink the vma to just the new range. Always succeeds. */ vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); return 0; } /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; else stack_base = vma->vm_end + stack_expand; #else if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_end - rlim_stack; else stack_base = vma->vm_start - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap(bprm->page[index]); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file; /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ err = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) goto exit; err = deny_write_access(file); if (err) goto exit; if (name->name[0] != '\0') fsnotify_open(file); return file; exit: fput(file); return ERR_PTR(err); } struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_lock around * checking core_state and changing tsk->mm. */ mmap_read_lock(old_mm); if (unlikely(old_mm->core_state)) { mmap_read_unlock(old_mm); up_write(&tsk->signal->exec_update_lock); return -EINTR; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if (signal_group_exit(sig)) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exit_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exit_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer wont't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exit_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exit_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update * to be lockless. */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; /* * Ensure that the uaccess routines can actually operate on userspace * pointers: */ force_uaccess_begin(); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct user_namespace *mnt_userns = file_mnt_user_ns(file); if (inode_permission(mnt_userns, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } if (bprm->file) { allow_write_access(bprm->file); fput(bprm->file); } if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) { struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); int retval = -ENOMEM; if (!bprm) goto out; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!bprm->fdpath) goto out_free; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) goto out_free; return bprm; out_free: free_bprm(bprm); out: return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct user_namespace *mnt_userns; struct inode *inode; unsigned int mode; kuid_t uid; kgid_t gid; int err; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; inode = file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; mnt_userns = file_mnt_user_ns(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; uid = i_uid_into_mnt(mnt_userns, inode); gid = i_gid_into_mnt(mnt_userns, inode); err = inode_permission(mnt_userns, inode, MAY_EXEC); inode_unlock(inode); /* Did the exec bit vanish out from under us? Give up. */ if (err) return; /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || !kgid_has_mapping(bprm->cred->user_ns, gid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = uid; } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = gid; } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { int ret = 0; unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) { ret = -EFAULT; goto out; } kaddr = kmap_atomic(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_atomic(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; ret = 0; out: return ret; } EXPORT_SYMBOL(remove_arg_zero); #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); if (need_retry) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) return retval; need_retry = false; goto retry; } return retval; } static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } /* * sys_execve() executes a new program. */ static int bprm_execve(struct linux_binprm *bprm, int fd, struct filename *filename, int flags) { struct file *file; int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; check_unsafe_exec(bprm); current->in_execve = 1; file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; sched_exec(); bprm->file = file; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (bprm->fdpath && get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); out_unmark: current->fs->in_exec = 0; current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; } retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm, fd, filename, 0); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif
47 47 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_ipt.c iptables target interface * *TODO: Add other tables. For now we only support the ipv4 table targets * * Copyright: Jamal Hadi Salim (2002-13) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_ipt.h> #include <net/tc_act/tc_ipt.h> #include <linux/netfilter_ipv4/ip_tables.h> static unsigned int ipt_net_id; static struct tc_action_ops act_ipt_ops; static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; static int ipt_init_target(struct net *net, struct xt_entry_target *t, char *table, unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; struct ipt_entry e = {}; int ret = 0; target = xt_request_find_target(AF_INET, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) return PTR_ERR(target); t->u.kernel.target = target; memset(&par, 0, sizeof(par)); par.net = net; par.table = table; par.entryinfo = &e; par.target = target; par.targinfo = t->data; par.hook_mask = 1 << hook; par.family = NFPROTO_IPV4; ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); if (ret < 0) { module_put(t->u.kernel.target->me); return ret; } return 0; } static void ipt_destroy_target(struct xt_entry_target *t, struct net *net) { struct xt_tgdtor_param par = { .target = t->u.kernel.target, .targinfo = t->data, .family = NFPROTO_IPV4, .net = net, }; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); } static void tcf_ipt_release(struct tc_action *a) { struct tcf_ipt *ipt = to_ipt(a); if (ipt->tcfi_t) { ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net); kfree(ipt->tcfi_t); } kfree(ipt->tcfi_tname); } static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_IPT_HOOK] = NLA_POLICY_RANGE(NLA_U32, NF_INET_PRE_ROUTING, NF_INET_NUMHOOKS), [TCA_IPT_INDEX] = { .type = NLA_U32 }, [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; bool exists = false; int ret = 0, err; u32 hook = 0; u32 index = 0; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_IPT_MAX, nla, ipt_policy, NULL); if (err < 0) return err; if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, false, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } err = -EINVAL; hook = nla_get_u32(tb[TCA_IPT_HOOK]); switch (hook) { case NF_INET_PRE_ROUTING: break; case NF_INET_POST_ROUTING: break; default: goto err1; } if (tb[TCA_IPT_TABLE]) { /* mangle only for now */ if (nla_strcmp(tb[TCA_IPT_TABLE], "mangle")) goto err1; } tname = kstrdup("mangle", GFP_KERNEL); if (unlikely(!tname)) goto err1; t = kmemdup(td, td->u.target_size, GFP_KERNEL); if (unlikely(!t)) goto err2; err = ipt_init_target(net, t, tname, hook); if (err < 0) goto err3; ipt = to_ipt(*a); spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { ipt_destroy_target(ipt->tcfi_t, net); kfree(ipt->tcfi_tname); kfree(ipt->tcfi_t); } ipt->tcfi_tname = tname; ipt->tcfi_t = t; ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); return ret; err3: kfree(t); err2: kfree(tname); err1: tcf_idr_release(*a, bind); return err; } static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *ipt = to_ipt(a); struct xt_action_param par; struct nf_hook_state state = { .net = dev_net(skb->dev), .in = skb->dev, .hook = ipt->tcfi_hook, .pf = NFPROTO_IPV4, }; if (skb_unclone(skb, GFP_ATOMIC)) return TC_ACT_UNSPEC; spin_lock(&ipt->tcf_lock); tcf_lastuse_update(&ipt->tcf_tm); bstats_update(&ipt->tcf_bstats, skb); /* yes, we have to worry about both in and out dev * worry later - danger - this API seems to have changed * from earlier kernels */ par.state = &state; par.target = ipt->tcfi_t->u.kernel.target; par.targinfo = ipt->tcfi_t->data; ret = par.target->target(skb, &par); switch (ret) { case NF_ACCEPT: result = TC_ACT_OK; break; case NF_DROP: result = TC_ACT_SHOT; ipt->tcf_qstats.drops++; break; case XT_CONTINUE: result = TC_ACT_PIPE; break; default: net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", ret); result = TC_ACT_OK; break; } spin_unlock(&ipt->tcf_lock); return result; } static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_ipt *ipt = to_ipt(a); struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; /* for simple targets kernel size == user size * user name = target name * for foolproof you need to not assume this */ spin_lock_bh(&ipt->tcf_lock); t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) goto nla_put_failure; c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) goto nla_put_failure; tcf_tm_dump(&tm, &ipt->tcf_tm); if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; spin_unlock_bh(&ipt->tcf_lock); kfree(t); return skb->len; nla_put_failure: spin_unlock_bh(&ipt->tcf_lock); nlmsg_trim(skb, b); kfree(t); return -1; } static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ipt_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ipt_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .id = TCA_ID_IPT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, .size = sizeof(struct tcf_ipt), }; static __net_init int ipt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ipt_net_id); return tc_action_net_init(net, tn, &act_ipt_ops); } static void __net_exit ipt_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, ipt_net_id); } static struct pernet_operations ipt_net_ops = { .init = ipt_init_net, .exit_batch = ipt_exit_net, .id = &ipt_net_id, .size = sizeof(struct tc_action_net), }; static int tcf_xt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, xt_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, xt_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_xt_ops = { .kind = "xt", .id = TCA_ID_XT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, .size = sizeof(struct tcf_ipt), }; static __net_init int xt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, xt_net_id); return tc_action_net_init(net, tn, &act_xt_ops); } static void __net_exit xt_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, xt_net_id); } static struct pernet_operations xt_net_ops = { .init = xt_init_net, .exit_batch = xt_exit_net, .id = &xt_net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); MODULE_DESCRIPTION("Iptables target actions"); MODULE_LICENSE("GPL"); MODULE_ALIAS("act_xt"); static int __init ipt_init_module(void) { int ret1, ret2; ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops); if (ret1 < 0) pr_err("Failed to load xt action\n"); ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops); if (ret2 < 0) pr_err("Failed to load ipt action\n"); if (ret1 < 0 && ret2 < 0) { return ret1; } else return 0; } static void __exit ipt_cleanup_module(void) { tcf_unregister_action(&act_ipt_ops, &ipt_net_ops); tcf_unregister_action(&act_xt_ops, &xt_net_ops); } module_init(ipt_init_module); module_exit(ipt_cleanup_module);
4 4 4 4 2 1 1 1 1 5 1 4 4 1 3 1 7 7 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/skmsg.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> #include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { bool apply = apply_bytes; struct scatterlist *sge; u32 size, copied = 0; struct sk_msg *tmp; int i, ret = 0; tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!tmp)) return -ENOMEM; lock_sock(sk); tmp->sg.start = msg->sg.start; i = msg->sg.start; do { sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; } sk_mem_charge(sk, size); atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) get_page(sk_msg_page(tmp, i)); sk_msg_iter_var_next(i); tmp->sg.end = i; if (apply) { apply_bytes -= size; if (!apply_bytes) { if (sge->length) sk_msg_iter_var_prev(i); break; } } } while (i != msg->sg.end); if (!ret) { msg->sg.start = i; if (!sk_psock_queue_msg(psock, tmp)) atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); } release_sock(sk); return ret; } static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { bool apply = apply_bytes; struct scatterlist *sge; struct page *page; int size, ret = 0; u32 off; while (1) { bool has_tx_ulp; sge = sk_msg_elem(msg, msg->sg.start); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; off = sge->offset; page = sg_page(sge); tcp_rate_check_app_limited(sk); retry: has_tx_ulp = tls_sw_has_ctx_tx(sk); if (has_tx_ulp) { flags |= MSG_SENDPAGE_NOPOLICY; ret = kernel_sendpage_locked(sk, page, off, size, flags); } else { ret = do_tcp_sendpages(sk, page, off, size, flags); } if (ret <= 0) return ret; if (apply) apply_bytes -= ret; msg->sg.size -= ret; sge->offset += ret; sge->length -= ret; if (uncharge) sk_mem_uncharge(sk, ret); if (ret != size) { size -= ret; off += ret; goto retry; } if (!sge->length) { put_page(page); sk_msg_iter_next(msg, start); sg_init_table(sge, 1); if (msg->sg.start == msg->sg.end) break; } if (apply && !apply_bytes) break; } return 0; } static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { int ret; lock_sock(sk); ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); release_sock(sk); return ret; } int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags) { struct sk_psock *psock = sk_psock_get(sk); int ret; if (unlikely(!psock)) return -EPIPE; ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, !list_empty(&psock->ingress_msg) || !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static bool is_next_msg_fin(struct sk_psock *psock) { struct scatterlist *sge; struct sk_msg *msg_rx; int i; msg_rx = sk_psock_peek_msg(psock); i = msg_rx->sg.start; sge = sk_msg_elem(msg_rx, i); if (!sge->length) { struct sk_buff *skb = msg_rx->skb; if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) return true; } return false; } static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct sk_psock *psock; int copied; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); /* We may have received data on the sk_receive_queue pre-accept and * then we can not use read_skb in this context because we haven't * assigned a sk_socket yet so have no link to the ops. The work-around * is to check the sk_receive_queue and in these cases read skbs off * queue again. The read_skb hook is not running at this point because * of lock_sock so we avoid having multiple runners in read_skb. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { tcp_data_ready(sk); /* This handles the ENOMEM errors if we both receive data * pre accept and are already under memory pressure. At least * let user know to retry. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { copied = -EAGAIN; goto out; } } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. * On fin return correct return code to zero. */ if (copied == -EFAULT) { bool is_fin = is_next_msg_fin(psock); if (is_fin) { copied = 0; goto out; } } if (!copied) { long timeo; int data; if (sock_flag(sk, SOCK_DONE)) goto out; if (sk->sk_err) { copied = sock_error(sk); goto out; } if (sk->sk_shutdown & RCV_SHUTDOWN) goto out; if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, nonblock); if (!timeo) { copied = -EAGAIN; goto out; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); goto out; } data = tcp_msg_wait_data(sk, psock, timeo); if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; } out: release_sock(sk); sk_psock_put(sk, psock); return copied; } static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); } lock_sock(sk); msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, nonblock); data = tcp_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); } copied = -EAGAIN; } ret = copied; release_sock(sk); sk_psock_put(sk, psock); return ret; } static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; int ret; more_data: if (psock->eval == __SK_NONE) { /* Track delta in msg size to add/subtract it on SK_DROP from * returned to user copied size. This ensures user doesn't * get a positive return code with msg_cut_data and SK_DROP * verdict. */ delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); delta -= msg->sg.size; } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { psock->cork_bytes = msg->cork_bytes - msg->sg.size; if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); if (!psock->cork) return -ENOMEM; } memcpy(psock->cork, msg, sizeof(*msg)); return 0; } tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: ret = tcp_bpf_push(sk, msg, tosend, flags, true); if (unlikely(ret)) { *copied -= sk_msg_free(sk, msg); break; } sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { /* Clean up before releasing the sock lock. */ eval = psock->eval; psock->eval = __SK_NONE; psock->sk_redir = NULL; } if (psock->cork) { cork = true; psock->cork = NULL; } release_sock(sk); origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); lock_sock(sk); sk_mem_uncharge(sk, sent); if (unlikely(ret < 0)) { int free = sk_msg_free(sk, msg); if (!cork) *copied -= free; } if (cork) { sk_msg_free(sk, msg); kfree(msg); msg = NULL; ret = 0; } break; case __SK_DROP: default: sk_msg_free(sk, msg); sk_msg_apply_bytes(psock, tosend); *copied -= (tosend + delta); return -EACCES; } if (likely(!ret)) { if (!psock->apply_bytes) { psock->eval = __SK_NONE; if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } } if (msg && msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) goto more_data; } return ret; } static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; int copied = 0, err = 0; struct sk_psock *psock; long timeo; int flags; /* Don't let internal do_tcp_sendpages() flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_sendmsg(sk, msg, size); lock_sock(sk); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); while (msg_data_left(msg)) { bool enospc = false; u32 copy, osize; if (sk->sk_err) { err = -sk->sk_err; goto out_err; } copy = msg_data_left(msg); if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; if (psock->cork) { msg_tx = psock->cork; } else { msg_tx = &tmp; sk_msg_init(msg_tx); } osize = msg_tx->sg.size; err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); if (err) { if (err != -ENOSPC) goto wait_for_memory; enospc = true; copy = msg_tx->sg.size - osize; } err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); if (err < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } copied += copy; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; else psock->cork_bytes -= size; if (psock->cork_bytes && !enospc) goto out_err; /* All cork bytes are accounted, rerun the prog. */ psock->eval = __SK_NONE; psock->cork_bytes = 0; } err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); if (unlikely(err < 0)) goto out_err; continue; wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (msg_tx && msg_tx != psock->cork) sk_msg_free(sk, msg_tx); goto out_err; } } out_err: if (err < 0) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); sk_psock_put(sk, psock); return copied > 0 ? copied : err; } static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct sk_msg tmp, *msg = NULL; int err = 0, copied = 0; struct sk_psock *psock; bool enospc = false; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_sendpage(sk, page, offset, size, flags); lock_sock(sk); if (psock->cork) { msg = psock->cork; } else { msg = &tmp; sk_msg_init(msg); } /* Catch case where ring is full and sendpage is stalled. */ if (unlikely(sk_msg_full(msg))) goto out_err; sk_msg_page_add(msg, page, size, offset); sk_mem_charge(sk, size); copied = size; if (sk_msg_full(msg)) enospc = true; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; else psock->cork_bytes -= size; if (psock->cork_bytes && !enospc) goto out_err; /* All cork bytes are accounted, rerun the prog. */ psock->eval = __SK_NONE; psock->cork_bytes = 0; } err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); out_err: release_sock(sk); sk_psock_put(sk, psock); return copied ? copied : err; } enum { TCP_BPF_IPV4, TCP_BPF_IPV6, TCP_BPF_NUM_PROTS, }; enum { TCP_BPF_BASE, TCP_BPF_TX, TCP_BPF_RX, TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; static struct proto *tcpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(tcpv6_prot_lock); static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); smp_store_release(&tcpv6_prot_saved, ops); } spin_unlock_bh(&tcpv6_prot_lock); } } static int __init tcp_bpf_v4_build_proto(void) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); return 0; } late_initcall(tcp_bpf_v4_build_proto); static int tcp_bpf_assert_proto_ops(struct proto *ops) { /* In order to avoid retpoline, we make assumptions when we call * into ops if e.g. a psock is not present. Make sure they are * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && ops->sendmsg == tcp_sendmsg && ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; if (psock->progs.stream_verdict || psock->progs.skb_verdict) { config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; } if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, * but we need to ensure we stop using the sock_map * unhash routine because the associated psock is being * removed. So use the original unhash handler. */ WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } else { sk->sk_write_space = psock->saved_write_space; /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, psock->sk_proto); } return 0; } if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); return 0; } EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to * the default ones because the child does not inherit the psock state * that tcp_bpf callbacks expect. */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { struct proto *prot = newsk->sk_prot; if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "blk-mq.h" #include "blk-mq-tag.h" #define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async); void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_requests(struct request_queue *q); static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return false; return __blk_mq_sched_bio_merge(q, bio, nr_segs); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct elevator_queue *e = q->elevator; if (e && e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { struct elevator_queue *e = rq->q->elevator; if (e && e->type->ops.completed_request) e->type->ops.completed_request(rq, now); } static inline void blk_mq_sched_requeue_request(struct request *rq) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } #endif
7 7 7 3 3 3 3 3 3 23 23 23 5 23 23 23 5 23 7 6 5 6 5 7 7 7 7 7 3 7 7 7 7 18 18 18 18 18 18 18 18 12 12 12 12 10 2 3 3 2 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/igmp.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <net/ip.h> #include <net/netlink.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/addrconf.h> #endif #include "br_private.h" static bool br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { *timer = br_timer_value(&pmctx->ip4_mc_router_timer); return !hlist_unhashed(&pmctx->ip4_rlist); } static bool br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { #if IS_ENABLED(CONFIG_IPV6) *timer = br_timer_value(&pmctx->ip6_mc_router_timer); return !hlist_unhashed(&pmctx->ip6_rlist); #else *timer = 0; return false; #endif } static size_t __br_rports_one_size(void) { return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */ nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */ nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */ } size_t br_rports_size(const struct net_bridge_mcast *brmctx) { struct net_bridge_mcast_port *pmctx; size_t size = nla_total_size(0); /* MDBA_ROUTER */ rcu_read_lock(); hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) size += __br_rports_one_size(); #if IS_ENABLED(CONFIG_IPV6) hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) size += __br_rports_one_size(); #endif rcu_read_unlock(); return size; } int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx) { u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0; bool have_ip4_mc_rtr, have_ip6_mc_rtr; unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; struct net_bridge_port *p; if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; list_for_each_entry_rcu(p, &brmctx->br->port_list, list) { struct net_bridge_mcast_port *pmctx; if (vid) { struct net_bridge_vlan *v; v = br_vlan_find(nbp_vlan_group(p), vid); if (!v) continue; pmctx = &v->port_mcast_ctx; } else { pmctx = &p->multicast_ctx; } have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer); have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer); if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, p->multicast_ctx.multicast_router) || (have_ip4_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, ip4_timer)) || (have_ip6_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, ip6_timer)) || (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); return 0; fail: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) { e->state = flags & MDB_PG_FLAGS_PERMANENT; e->flags = 0; if (flags & MDB_PG_FLAGS_OFFLOAD) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; if (flags & MDB_PG_FLAGS_STAR_EXCL) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; switch (ip->proto) { case htons(ETH_P_IP): ip->dst.ip4 = entry->addr.u.ip4; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip->dst.ip6 = entry->addr.u.ip6; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif default: ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } static int __mdb_fill_srcs(struct sk_buff *skb, struct net_bridge_port_group *p) { struct net_bridge_group_src *ent; struct nlattr *nest, *nest_ent; if (hlist_empty(&p->src_list)) return 0; nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); if (!nest) return -EMSGSIZE; hlist_for_each_entry_rcu(ent, &p->src_list, node, lockdep_is_held(&p->key.port->br->multicast_lock)) { nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); if (!nest_ent) goto out_cancel_err; switch (ent->addr.proto) { case htons(ETH_P_IP): if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, ent->addr.src.ip4)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, &ent->addr.src.ip6)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #endif default: nla_nest_cancel(skb, nest_ent); continue; } if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, br_timer_value(&ent->timer))) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest); return 0; out_cancel_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; u8 flags = 0; int ifindex; memset(&e, 0, sizeof(e)); if (p) { ifindex = p->key.port->dev->ifindex; mtimer = &p->timer; flags = p->flags; } else { ifindex = mp->br->dev->ifindex; mtimer = &mp->timer; } __mdb_entry_fill_flags(&e, flags); e.ifindex = ifindex; e.vid = mp->addr.vid; if (mp->addr.proto == htons(ETH_P_IP)) e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) else if (mp->addr.proto == htons(ETH_P_IPV6)) e.addr.u.ip6 = mp->addr.dst.ip6; #endif else ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); if (!nest_ent) return -EMSGSIZE; if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, br_timer_value(mtimer))) goto nest_err; switch (mp->addr.proto) { case htons(ETH_P_IP): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) goto nest_err; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) goto nest_err; break; } break; #endif default: ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) goto nest_err; if (dump_srcs_mode && (__mdb_fill_srcs(skb, p) || nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) goto nest_err; } nla_nest_end(skb, nest_ent); return 0; nest_err: nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; if (idx < s_idx) goto skip; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!nest2) { err = -EMSGSIZE; break; } if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); break; } } for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { if (!p->key.port) continue; if (pidx < s_pidx) goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { nla_nest_end(skb, nest2); goto out; } skip_pg: pidx++; } pidx = 0; s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; } out: cb->args[1] = idx; cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } static int br_mdb_valid_dump_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct br_port_msg *bpm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for mdb dump request"); return -EINVAL; } bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG_MOD(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*bpm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); return -EINVAL; } return 0; } static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh = NULL; int idx = 0, s_idx; if (cb->strict_check) { int err = br_mdb_valid_dump_req(cb->nlh, cb->extack); if (err < 0) return err; } s_idx = cb->args[0]; rcu_read_lock(); cb->seq = net->dev_base_seq; for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_EBRIDGE) { struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; if (idx < s_idx) goto skip; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_GETMDB, sizeof(*bpm), NLM_F_MULTI); if (nlh == NULL) break; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->ifindex = dev->ifindex; if (br_mdb_fill_info(skb, cb, dev) < 0) goto out; if (br_rports_fill_info(skb, &br->multicast_ctx) < 0) goto out; cb->args[1] = 0; nlmsg_end(skb, nlh); skip: idx++; } } out: if (nlh) nlmsg_end(skb, nlh); rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct nlmsghdr *nlh; struct br_port_msg *bpm; struct nlattr *nest, *nest2; nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) goto cancel; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (nest2 == NULL) goto end; if (__mdb_fill_info(skb, mp, pg)) goto end; nla_nest_end(skb, nest2); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) { size_t nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) + nla_total_size(sizeof(struct br_mdb_entry)) + nla_total_size(sizeof(u32)); struct net_bridge_group_src *ent; size_t addr_size = 0; if (!pg) goto out; /* MDBA_MDB_EATTR_RTPROT */ nlmsg_size += nla_total_size(sizeof(u8)); switch (pg->key.addr.proto) { case htons(ETH_P_IP): /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; #endif } /* MDBA_MDB_EATTR_GROUP_MODE */ nlmsg_size += nla_total_size(sizeof(u8)); /* MDBA_MDB_EATTR_SRC_LIST nested attr */ if (!hlist_empty(&pg->src_list)) nlmsg_size += nla_total_size(0); hlist_for_each_entry(ent, &pg->src_list, node) { /* MDBA_MDB_SRCLIST_ENTRY nested attr + * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER */ nlmsg_size += nla_total_size(0) + nla_total_size(addr_size) + nla_total_size(sizeof(u32)); } out: return nlmsg_size; } struct br_mdb_complete_info { struct net_bridge_port *port; struct br_ip ip; }; static void br_mdb_complete(struct net_device *dev, int err, void *priv) { struct br_mdb_complete_info *data = priv; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; if (err) goto err; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); if (!mp) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port != port) continue; p->flags |= MDB_PG_FLAGS_OFFLOAD; } out: spin_unlock_bh(&br->multicast_lock); err: kfree(priv); } static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, const struct net_bridge_mdb_entry *mp) { if (mp->addr.proto == htons(ETH_P_IP)) ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); #if IS_ENABLED(CONFIG_IPV6) else if (mp->addr.proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); #endif else ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); mdb->vid = mp->addr.vid; } static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, const struct switchdev_obj_port_mdb *mdb, unsigned long action, const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, .ctx = ctx, }, .obj = &mdb->obj, }; int err; err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } static int br_mdb_queue_one(struct list_head *mdb_list, enum switchdev_obj_id id, const struct net_bridge_mdb_entry *mp, struct net_device *orig_dev) { struct switchdev_obj_port_mdb *mdb; mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC); if (!mdb) return -ENOMEM; mdb->obj.id = id; mdb->obj.orig_dev = orig_dev; br_switchdev_mdb_populate(mdb, mp); list_add_tail(&mdb->obj.list, mdb_list); return 0; } int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { const struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; struct net_bridge *br; unsigned long action; LIST_HEAD(mdb_list); int err = 0; ASSERT_RTNL(); if (!nb) return 0; if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) return -EINVAL; br = netdev_priv(br_dev); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; /* We cannot walk over br->mdb_list protected just by the rtnl_mutex, * because the write-side protection is br->multicast_lock. But we * need to emulate the [ blocking ] calling context of a regular * switchdev event, so since both br->multicast_lock and RCU read side * critical sections are atomic, we have no choice but to pick the RCU * read side lock, queue up all our events, leave the critical section * and notify switchdev from blocking context. */ rcu_read_lock(); hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group __rcu * const *pp; const struct net_bridge_port_group *p; if (mp->host_joined) { err = br_mdb_queue_one(&mdb_list, SWITCHDEV_OBJ_ID_HOST_MDB, mp, br_dev); if (err) { rcu_read_unlock(); goto out_free_mdb; } } for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { if (p->key.port->dev != dev) continue; err = br_mdb_queue_one(&mdb_list, SWITCHDEV_OBJ_ID_PORT_MDB, mp, dev); if (err) { rcu_read_unlock(); goto out_free_mdb; } } } rcu_read_unlock(); if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; list_for_each_entry(obj, &mdb_list, list) { err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); if (err) goto out_free_mdb; } out_free_mdb: list_for_each_entry_safe(obj, tmp, &mdb_list, list) { list_del(&obj->list); kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); } return err; } static void br_mdb_switchdev_host_port(struct net_device *dev, struct net_device *lower_dev, struct net_bridge_mdb_entry *mp, int type) { struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_HOST_MDB, .flags = SWITCHDEV_F_DEFER, .orig_dev = dev, }, }; br_switchdev_mdb_populate(&mdb, mp); switch (type) { case RTM_NEWMDB: switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); break; case RTM_DELMDB: switchdev_port_obj_del(lower_dev, &mdb.obj); break; } } static void br_mdb_switchdev_host(struct net_device *dev, struct net_bridge_mdb_entry *mp, int type) { struct net_device *lower_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, lower_dev, iter) br_mdb_switchdev_host_port(dev, lower_dev, mp, type); } void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct br_mdb_complete_info *complete_info; struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_PORT_MDB, .flags = SWITCHDEV_F_DEFER, }, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; if (pg) { br_switchdev_mdb_populate(&mdb, mp); mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); if (!complete_info) break; complete_info->port = pg->key.port; complete_info->ip = mp->addr; mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_mdb_complete; if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL)) kfree(complete_info); break; case RTM_DELMDB: switchdev_port_obj_del(pg->key.port->dev, &mdb.obj); break; } } else { br_mdb_switchdev_host(dev, mp, type); } skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (!nest) goto cancel; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto end; if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { nla_nest_cancel(skb, port_nest); goto end; } if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { nla_nest_cancel(skb, port_nest); goto end; } nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + nla_total_size(sizeof(__u32)) + nla_total_size(sizeof(u16)); } void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; u16 vid; ifindex = pmctx ? pmctx->port->dev->ifindex : 0; vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static bool is_valid_mdb_entry(struct br_mdb_entry *entry, struct netlink_ext_ack *extack) { if (entry->ifindex == 0) { NL_SET_ERR_MSG_MOD(extack, "Zero entry ifindex is not allowed"); return false; } if (entry->addr.proto == htons(ETH_P_IP)) { if (!ipv4_is_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is not multicast"); return false; } if (ipv4_is_local_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is local multicast"); return false; } #if IS_ENABLED(CONFIG_IPV6) } else if (entry->addr.proto == htons(ETH_P_IPV6)) { if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 entry group address is link-local all nodes"); return false; } #endif } else if (entry->addr.proto == 0) { /* L2 mdb */ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { NL_SET_ERR_MSG_MOD(extack, "L2 entry group is not multicast"); return false; } } else { NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol"); return false; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG_MOD(extack, "Unknown entry state"); return false; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG_MOD(extack, "Invalid entry VLAN id"); return false; } return true; } static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, struct netlink_ext_ack *extack) { switch (proto) { case htons(ETH_P_IP): if (nla_len(attr) != sizeof(struct in_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); return false; } if (ipv4_is_multicast(nla_get_in_addr(attr))) { NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); return false; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr src; if (nla_len(attr) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); return false; } src = nla_get_in6_addr(attr); if (ipv6_addr_is_multicast(&src)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); return false; } break; } #endif default: NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); return false; } return true; } static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device **pdev, struct br_mdb_entry **pentry, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct br_mdb_entry *entry; struct br_port_msg *bpm; struct nlattr *tb[MDBA_SET_ENTRY_MAX+1]; struct net_device *dev; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, NULL, NULL); if (err < 0) return err; bpm = nlmsg_data(nlh); if (bpm->ifindex == 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid bridge ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (dev == NULL) { NL_SET_ERR_MSG_MOD(extack, "Bridge device doesn't exist"); return -ENODEV; } if (!(dev->priv_flags & IFF_EBRIDGE)) { NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge"); return -EOPNOTSUPP; } *pdev = dev; if (!tb[MDBA_SET_ENTRY]) { NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length"); return -EINVAL; } entry = nla_data(tb[MDBA_SET_ENTRY]); if (!is_valid_mdb_entry(entry, extack)) return -EINVAL; *pentry = entry; if (tb[MDBA_SET_ENTRY_ATTRS]) { err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, tb[MDBA_SET_ENTRY_ATTRS], br_mdbe_attrs_pol, extack); if (err) return err; if (mdb_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], entry->addr.proto, extack)) return -EINVAL; } else { memset(mdb_attrs, 0, sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1)); } return 0; } static struct net_bridge_mcast * __br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, struct netlink_ext_ack *extack) { struct net_bridge_mcast *brmctx = NULL; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { brmctx = &br->multicast_ctx; goto out; } if (!entry->vid) { NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); goto out; } v = br_vlan_find(br_vlan_group(br), entry->vid); if (!v) { NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); goto out; } if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); goto out; } brmctx = &v->br_mcast_ctx; out: return brmctx; } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_mdb_entry *entry, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp, *star_mp; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; struct net_bridge_mcast *brmctx; struct br_ip group, star_group; unsigned long now = jiffies; unsigned char flags = 0; u8 filter_mode; int err; __mdb_entry_to_br_ip(entry, &group, mdb_attrs); brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) return -EINVAL; /* host join errors which can happen before creating the group */ if (!port) { /* don't allow any flags for host-joined groups */ if (entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); return -EINVAL; } } if (br_group_is_l2(&group) && entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); return -EINVAL; } mp = br_mdb_ip_get(br, &group); if (!mp) { mp = br_multicast_new_group(br, &group); err = PTR_ERR_OR_ZERO(mp); if (err) return err; } /* host join */ if (!port) { if (mp->host_joined) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port == port) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by port"); return -EEXIST; } if ((unsigned long)p->key.port < (unsigned long)port) break; } filter_mode = br_multicast_is_star_g(&group) ? MCAST_EXCLUDE : MCAST_INCLUDE; if (entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; p = br_multicast_new_port_group(port, &group, *pp, flags, NULL, filter_mode, RTPROT_STATIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); return -ENOMEM; } rcu_assign_pointer(*pp, p); if (entry->state == MDB_TEMPORARY) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); /* if we are adding a new EXCLUDE port group (*,G) it needs to be also * added to all S,G entries for proper replication, if we are adding * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be * added to it for proper replication */ if (br_multicast_should_handle_mode(brmctx, group.proto)) { switch (filter_mode) { case MCAST_EXCLUDE: br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); break; case MCAST_INCLUDE: star_group = p->key.addr; memset(&star_group.src, 0, sizeof(star_group.src)); star_mp = br_mdb_ip_get(br, &star_group); if (star_mp) br_multicast_sg_add_exclude_ports(star_mp, p); break; } } return 0; } static int __br_mdb_add(struct net *net, struct net_bridge *br, struct net_bridge_port *p, struct br_mdb_entry *entry, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { int ret; spin_lock_bh(&br->multicast_lock); ret = br_mdb_add_group(br, p, entry, mdb_attrs, extack); spin_unlock_bh(&br->multicast_lock); return ret; } static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_vlan *v; struct net_bridge *br; int err; err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; br = netdev_priv(dev); if (!netif_running(br->dev)) { NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); return -EINVAL; } if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); return -EINVAL; } if (entry->ifindex != br->dev->ifindex) { pdev = __dev_get_by_index(net, entry->ifindex); if (!pdev) { NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); return -ENODEV; } p = br_port_get_rtnl(pdev); if (!p) { NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); return -EINVAL; } if (p->br != br) { NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; } if (p->state == BR_STATE_DISABLED) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state"); return -EINVAL; } vg = nbp_vlan_group(p); } else { vg = br_vlan_group(br); } /* If vlan filtering is enabled and VLAN is not specified * install mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); if (err) break; } } else { err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); } return err; } static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry, struct nlattr **mdb_attrs) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip ip; int err = -EINVAL; if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return -EINVAL; __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; /* host leave */ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) { br_multicast_host_leave(mp, false); err = 0; br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); goto unlock; } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex) continue; if (p->key.port->state == BR_STATE_DISABLED) goto unlock; br_multicast_del_pg(mp, p, pp); err = 0; break; } unlock: spin_unlock_bh(&br->multicast_lock); return err; } static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_vlan *v; struct net_bridge *br; int err; err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; br = netdev_priv(dev); if (entry->ifindex != br->dev->ifindex) { pdev = __dev_get_by_index(net, entry->ifindex); if (!pdev) return -ENODEV; p = br_port_get_rtnl(pdev); if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; vg = nbp_vlan_group(p); } else { vg = br_vlan_group(br); } /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; err = __br_mdb_del(br, entry, mdb_attrs); } } else { err = __br_mdb_del(br, entry, mdb_attrs); } return err; } void br_mdb_init(void) { rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0); rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0); rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0); } void br_mdb_uninit(void) { rtnl_unregister(PF_BRIDGE, RTM_GETMDB); rtnl_unregister(PF_BRIDGE, RTM_NEWMDB); rtnl_unregister(PF_BRIDGE, RTM_DELMDB); }
164 973 955 18 39 165 125 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned/packed_struct.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix -- mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitray value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. */ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += __get_unaligned_cpu32(k); b += __get_unaligned_cpu32(k + 4); c += __get_unaligned_cpu32(k + 8); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitray value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 /* * Copyright (c) 2018 Cumulus Networks. All rights reserved. * Copyright (c) 2018 David Ahern <dsa@cumulusnetworks.com> * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/etherdevice.h> #include <linux/inet.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rtnetlink.h> #include <linux/workqueue.h> #include <net/devlink.h> #include <net/ip.h> #include <net/flow_offload.h> #include <uapi/linux/devlink.h> #include <uapi/linux/ip.h> #include <uapi/linux/udp.h> #include "netdevsim.h" static unsigned int nsim_dev_port_index(enum nsim_dev_port_type type, unsigned int port_index) { switch (type) { case NSIM_DEV_PORT_TYPE_VF: port_index = NSIM_DEV_VF_PORT_INDEX_BASE + port_index; break; case NSIM_DEV_PORT_TYPE_PF: break; } return port_index; } static inline unsigned int nsim_dev_port_index_to_vf_index(unsigned int port_index) { return port_index - NSIM_DEV_VF_PORT_INDEX_BASE; } static struct dentry *nsim_dev_ddir; #define NSIM_DEV_DUMMY_REGION_SIZE (1024 * 32) static int nsim_dev_take_snapshot(struct devlink *devlink, const struct devlink_region_ops *ops, struct netlink_ext_ack *extack, u8 **data) { void *dummy_data; dummy_data = kmalloc(NSIM_DEV_DUMMY_REGION_SIZE, GFP_KERNEL); if (!dummy_data) return -ENOMEM; get_random_bytes(dummy_data, NSIM_DEV_DUMMY_REGION_SIZE); *data = dummy_data; return 0; } static ssize_t nsim_dev_take_snapshot_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; struct devlink *devlink; u8 *dummy_data; int err; u32 id; devlink = priv_to_devlink(nsim_dev); err = nsim_dev_take_snapshot(devlink, NULL, NULL, &dummy_data); if (err) return err; err = devlink_region_snapshot_id_get(devlink, &id); if (err) { pr_err("Failed to get snapshot id\n"); kfree(dummy_data); return err; } err = devlink_region_snapshot_create(nsim_dev->dummy_region, dummy_data, id); devlink_region_snapshot_id_put(devlink, id); if (err) { pr_err("Failed to create region snapshot\n"); kfree(dummy_data); return err; } return count; } static const struct file_operations nsim_dev_take_snapshot_fops = { .open = simple_open, .write = nsim_dev_take_snapshot_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static ssize_t nsim_dev_trap_fa_cookie_read(struct file *file, char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; struct flow_action_cookie *fa_cookie; unsigned int buf_len; ssize_t ret; char *buf; spin_lock(&nsim_dev->fa_cookie_lock); fa_cookie = nsim_dev->fa_cookie; if (!fa_cookie) { ret = -EINVAL; goto errout; } buf_len = fa_cookie->cookie_len * 2; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret = -ENOMEM; goto errout; } bin2hex(buf, fa_cookie->cookie, fa_cookie->cookie_len); spin_unlock(&nsim_dev->fa_cookie_lock); ret = simple_read_from_buffer(data, count, ppos, buf, buf_len); kfree(buf); return ret; errout: spin_unlock(&nsim_dev->fa_cookie_lock); return ret; } static ssize_t nsim_dev_trap_fa_cookie_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; struct flow_action_cookie *fa_cookie; size_t cookie_len; ssize_t ret; char *buf; if (*ppos != 0) return -EINVAL; cookie_len = (count - 1) / 2; if ((count - 1) % 2) return -EINVAL; buf = memdup_user(data, count); if (IS_ERR(buf)) return PTR_ERR(buf); fa_cookie = kmalloc(sizeof(*fa_cookie) + cookie_len, GFP_KERNEL | __GFP_NOWARN); if (!fa_cookie) { ret = -ENOMEM; goto free_buf; } fa_cookie->cookie_len = cookie_len; ret = hex2bin(fa_cookie->cookie, buf, cookie_len); if (ret) goto free_fa_cookie; kfree(buf); spin_lock(&nsim_dev->fa_cookie_lock); kfree(nsim_dev->fa_cookie); nsim_dev->fa_cookie = fa_cookie; spin_unlock(&nsim_dev->fa_cookie_lock); return count; free_fa_cookie: kfree(fa_cookie); free_buf: kfree(buf); return ret; } static const struct file_operations nsim_dev_trap_fa_cookie_fops = { .open = simple_open, .read = nsim_dev_trap_fa_cookie_read, .write = nsim_dev_trap_fa_cookie_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static const struct file_operations nsim_dev_max_vfs_fops = { .open = simple_open, .read = nsim_bus_dev_max_vfs_read, .write = nsim_bus_dev_max_vfs_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static int nsim_dev_debugfs_init(struct nsim_dev *nsim_dev) { char dev_ddir_name[sizeof(DRV_NAME) + 10]; int err; sprintf(dev_ddir_name, DRV_NAME "%u", nsim_dev->nsim_bus_dev->dev.id); nsim_dev->ddir = debugfs_create_dir(dev_ddir_name, nsim_dev_ddir); if (IS_ERR(nsim_dev->ddir)) return PTR_ERR(nsim_dev->ddir); nsim_dev->ports_ddir = debugfs_create_dir("ports", nsim_dev->ddir); if (IS_ERR(nsim_dev->ports_ddir)) { err = PTR_ERR(nsim_dev->ports_ddir); goto err_ddir; } debugfs_create_bool("fw_update_status", 0600, nsim_dev->ddir, &nsim_dev->fw_update_status); debugfs_create_u32("fw_update_overwrite_mask", 0600, nsim_dev->ddir, &nsim_dev->fw_update_overwrite_mask); debugfs_create_u32("max_macs", 0600, nsim_dev->ddir, &nsim_dev->max_macs); debugfs_create_bool("test1", 0600, nsim_dev->ddir, &nsim_dev->test1); nsim_dev->take_snapshot = debugfs_create_file("take_snapshot", 0200, nsim_dev->ddir, nsim_dev, &nsim_dev_take_snapshot_fops); debugfs_create_bool("dont_allow_reload", 0600, nsim_dev->ddir, &nsim_dev->dont_allow_reload); debugfs_create_bool("fail_reload", 0600, nsim_dev->ddir, &nsim_dev->fail_reload); debugfs_create_file("trap_flow_action_cookie", 0600, nsim_dev->ddir, nsim_dev, &nsim_dev_trap_fa_cookie_fops); debugfs_create_bool("fail_trap_group_set", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_group_set); debugfs_create_bool("fail_trap_policer_set", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_policer_set); debugfs_create_bool("fail_trap_policer_counter_get", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_policer_counter_get); nsim_dev->max_vfs = debugfs_create_file("max_vfs", 0600, nsim_dev->ddir, nsim_dev->nsim_bus_dev, &nsim_dev_max_vfs_fops); nsim_dev->nodes_ddir = debugfs_create_dir("rate_nodes", nsim_dev->ddir); if (IS_ERR(nsim_dev->nodes_ddir)) { err = PTR_ERR(nsim_dev->nodes_ddir); goto err_ports_ddir; } debugfs_create_bool("fail_trap_drop_counter_get", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_drop_counter_get); nsim_udp_tunnels_debugfs_create(nsim_dev); return 0; err_ports_ddir: debugfs_remove_recursive(nsim_dev->ports_ddir); err_ddir: debugfs_remove_recursive(nsim_dev->ddir); return err; } static void nsim_dev_debugfs_exit(struct nsim_dev *nsim_dev) { debugfs_remove_recursive(nsim_dev->nodes_ddir); debugfs_remove_recursive(nsim_dev->ports_ddir); debugfs_remove_recursive(nsim_dev->ddir); } static ssize_t nsim_dev_rate_parent_read(struct file *file, char __user *data, size_t count, loff_t *ppos) { char **name_ptr = file->private_data; size_t len; if (!*name_ptr) return 0; len = strlen(*name_ptr); return simple_read_from_buffer(data, count, ppos, *name_ptr, len); } static const struct file_operations nsim_dev_rate_parent_fops = { .open = simple_open, .read = nsim_dev_rate_parent_read, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) { struct nsim_bus_dev *nsim_bus_dev = nsim_dev->nsim_bus_dev; unsigned int port_index = nsim_dev_port->port_index; char port_ddir_name[16]; char dev_link_name[32]; sprintf(port_ddir_name, "%u", port_index); nsim_dev_port->ddir = debugfs_create_dir(port_ddir_name, nsim_dev->ports_ddir); if (IS_ERR(nsim_dev_port->ddir)) return PTR_ERR(nsim_dev_port->ddir); sprintf(dev_link_name, "../../../" DRV_NAME "%u", nsim_bus_dev->dev.id); if (nsim_dev_port_is_vf(nsim_dev_port)) { unsigned int vf_id = nsim_dev_port_index_to_vf_index(port_index); debugfs_create_u16("tx_share", 0400, nsim_dev_port->ddir, &nsim_bus_dev->vfconfigs[vf_id].min_tx_rate); debugfs_create_u16("tx_max", 0400, nsim_dev_port->ddir, &nsim_bus_dev->vfconfigs[vf_id].max_tx_rate); nsim_dev_port->rate_parent = debugfs_create_file("rate_parent", 0400, nsim_dev_port->ddir, &nsim_dev_port->parent_name, &nsim_dev_rate_parent_fops); } debugfs_create_symlink("dev", nsim_dev_port->ddir, dev_link_name); return 0; } static void nsim_dev_port_debugfs_exit(struct nsim_dev_port *nsim_dev_port) { debugfs_remove_recursive(nsim_dev_port->ddir); } static int nsim_dev_resources_register(struct devlink *devlink) { struct devlink_resource_size_params params = { .size_max = (u64)-1, .size_granularity = 1, .unit = DEVLINK_RESOURCE_UNIT_ENTRY }; int err; /* Resources for IPv4 */ err = devlink_resource_register(devlink, "IPv4", (u64)-1, NSIM_RESOURCE_IPV4, DEVLINK_RESOURCE_ID_PARENT_TOP, &params); if (err) { pr_err("Failed to register IPv4 top resource\n"); goto out; } err = devlink_resource_register(devlink, "fib", (u64)-1, NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4, &params); if (err) { pr_err("Failed to register IPv4 FIB resource\n"); return err; } err = devlink_resource_register(devlink, "fib-rules", (u64)-1, NSIM_RESOURCE_IPV4_FIB_RULES, NSIM_RESOURCE_IPV4, &params); if (err) { pr_err("Failed to register IPv4 FIB rules resource\n"); return err; } /* Resources for IPv6 */ err = devlink_resource_register(devlink, "IPv6", (u64)-1, NSIM_RESOURCE_IPV6, DEVLINK_RESOURCE_ID_PARENT_TOP, &params); if (err) { pr_err("Failed to register IPv6 top resource\n"); goto out; } err = devlink_resource_register(devlink, "fib", (u64)-1, NSIM_RESOURCE_IPV6_FIB, NSIM_RESOURCE_IPV6, &params); if (err) { pr_err("Failed to register IPv6 FIB resource\n"); return err; } err = devlink_resource_register(devlink, "fib-rules", (u64)-1, NSIM_RESOURCE_IPV6_FIB_RULES, NSIM_RESOURCE_IPV6, &params); if (err) { pr_err("Failed to register IPv6 FIB rules resource\n"); return err; } /* Resources for nexthops */ err = devlink_resource_register(devlink, "nexthops", (u64)-1, NSIM_RESOURCE_NEXTHOPS, DEVLINK_RESOURCE_ID_PARENT_TOP, &params); out: return err; } enum nsim_devlink_param_id { NSIM_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, NSIM_DEVLINK_PARAM_ID_TEST1, }; static const struct devlink_param nsim_devlink_params[] = { DEVLINK_PARAM_GENERIC(MAX_MACS, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, NULL), DEVLINK_PARAM_DRIVER(NSIM_DEVLINK_PARAM_ID_TEST1, "test1", DEVLINK_PARAM_TYPE_BOOL, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, NULL), }; static void nsim_devlink_set_params_init_values(struct nsim_dev *nsim_dev, struct devlink *devlink) { union devlink_param_value value; value.vu32 = nsim_dev->max_macs; devlink_param_driverinit_value_set(devlink, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, value); value.vbool = nsim_dev->test1; devlink_param_driverinit_value_set(devlink, NSIM_DEVLINK_PARAM_ID_TEST1, value); } static void nsim_devlink_param_load_driverinit_values(struct devlink *devlink) { struct nsim_dev *nsim_dev = devlink_priv(devlink); union devlink_param_value saved_value; int err; err = devlink_param_driverinit_value_get(devlink, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, &saved_value); if (!err) nsim_dev->max_macs = saved_value.vu32; err = devlink_param_driverinit_value_get(devlink, NSIM_DEVLINK_PARAM_ID_TEST1, &saved_value); if (!err) nsim_dev->test1 = saved_value.vbool; } #define NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX 16 static const struct devlink_region_ops dummy_region_ops = { .name = "dummy", .destructor = &kfree, .snapshot = nsim_dev_take_snapshot, }; static int nsim_dev_dummy_region_init(struct nsim_dev *nsim_dev, struct devlink *devlink) { nsim_dev->dummy_region = devlink_region_create(devlink, &dummy_region_ops, NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX, NSIM_DEV_DUMMY_REGION_SIZE); return PTR_ERR_OR_ZERO(nsim_dev->dummy_region); } static void nsim_dev_dummy_region_exit(struct nsim_dev *nsim_dev) { devlink_region_destroy(nsim_dev->dummy_region); } static void __nsim_dev_port_del(struct nsim_dev_port *nsim_dev_port); int nsim_esw_legacy_enable(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack) { struct devlink *devlink = priv_to_devlink(nsim_dev); struct nsim_dev_port *nsim_dev_port, *tmp; devlink_rate_nodes_destroy(devlink); mutex_lock(&nsim_dev->port_list_lock); list_for_each_entry_safe(nsim_dev_port, tmp, &nsim_dev->port_list, list) if (nsim_dev_port_is_vf(nsim_dev_port)) __nsim_dev_port_del(nsim_dev_port); mutex_unlock(&nsim_dev->port_list_lock); nsim_dev->esw_mode = DEVLINK_ESWITCH_MODE_LEGACY; return 0; } int nsim_esw_switchdev_enable(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack) { struct nsim_bus_dev *nsim_bus_dev = nsim_dev->nsim_bus_dev; int i, err; for (i = 0; i < nsim_bus_dev->num_vfs; i++) { err = nsim_dev_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_VF, i); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to initialize VFs' netdevsim ports"); pr_err("Failed to initialize VF id=%d. %d.\n", i, err); goto err_port_add_vfs; } } nsim_dev->esw_mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; return 0; err_port_add_vfs: for (i--; i >= 0; i--) nsim_dev_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_VF, i); return err; } static int nsim_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); int err = 0; mutex_lock(&nsim_dev->nsim_bus_dev->vfs_lock); if (mode == nsim_dev->esw_mode) goto unlock; if (mode == DEVLINK_ESWITCH_MODE_LEGACY) err = nsim_esw_legacy_enable(nsim_dev, extack); else if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) err = nsim_esw_switchdev_enable(nsim_dev, extack); else err = -EINVAL; unlock: mutex_unlock(&nsim_dev->nsim_bus_dev->vfs_lock); return err; } static int nsim_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) { struct nsim_dev *nsim_dev = devlink_priv(devlink); *mode = nsim_dev->esw_mode; return 0; } struct nsim_trap_item { void *trap_ctx; enum devlink_trap_action action; }; struct nsim_trap_data { struct delayed_work trap_report_dw; struct nsim_trap_item *trap_items_arr; u64 *trap_policers_cnt_arr; u64 trap_pkt_cnt; struct nsim_dev *nsim_dev; spinlock_t trap_lock; /* Protects trap_items_arr */ }; /* All driver-specific traps must be documented in * Documentation/networking/devlink/netdevsim.rst */ enum { NSIM_TRAP_ID_BASE = DEVLINK_TRAP_GENERIC_ID_MAX, NSIM_TRAP_ID_FID_MISS, }; #define NSIM_TRAP_NAME_FID_MISS "fid_miss" #define NSIM_TRAP_METADATA DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT #define NSIM_TRAP_DROP(_id, _group_id) \ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_TRAP_DROP_EXT(_id, _group_id, _metadata) \ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA | (_metadata)) #define NSIM_TRAP_EXCEPTION(_id, _group_id) \ DEVLINK_TRAP_GENERIC(EXCEPTION, TRAP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_TRAP_CONTROL(_id, _group_id, _action) \ DEVLINK_TRAP_GENERIC(CONTROL, _action, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_TRAP_DRIVER_EXCEPTION(_id, _group_id) \ DEVLINK_TRAP_DRIVER(EXCEPTION, TRAP, NSIM_TRAP_ID_##_id, \ NSIM_TRAP_NAME_##_id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_DEV_TRAP_POLICER_MIN_RATE 1 #define NSIM_DEV_TRAP_POLICER_MAX_RATE 8000 #define NSIM_DEV_TRAP_POLICER_MIN_BURST 8 #define NSIM_DEV_TRAP_POLICER_MAX_BURST 65536 #define NSIM_TRAP_POLICER(_id, _rate, _burst) \ DEVLINK_TRAP_POLICER(_id, _rate, _burst, \ NSIM_DEV_TRAP_POLICER_MAX_RATE, \ NSIM_DEV_TRAP_POLICER_MIN_RATE, \ NSIM_DEV_TRAP_POLICER_MAX_BURST, \ NSIM_DEV_TRAP_POLICER_MIN_BURST) static const struct devlink_trap_policer nsim_trap_policers_arr[] = { NSIM_TRAP_POLICER(1, 1000, 128), NSIM_TRAP_POLICER(2, 2000, 256), NSIM_TRAP_POLICER(3, 3000, 512), }; static const struct devlink_trap_group nsim_trap_groups_arr[] = { DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), DEVLINK_TRAP_GROUP_GENERIC(L3_DROPS, 1), DEVLINK_TRAP_GROUP_GENERIC(L3_EXCEPTIONS, 1), DEVLINK_TRAP_GROUP_GENERIC(BUFFER_DROPS, 2), DEVLINK_TRAP_GROUP_GENERIC(ACL_DROPS, 3), DEVLINK_TRAP_GROUP_GENERIC(MC_SNOOPING, 3), }; static const struct devlink_trap nsim_traps_arr[] = { NSIM_TRAP_DROP(SMAC_MC, L2_DROPS), NSIM_TRAP_DROP(VLAN_TAG_MISMATCH, L2_DROPS), NSIM_TRAP_DROP(INGRESS_VLAN_FILTER, L2_DROPS), NSIM_TRAP_DROP(INGRESS_STP_FILTER, L2_DROPS), NSIM_TRAP_DROP(EMPTY_TX_LIST, L2_DROPS), NSIM_TRAP_DROP(PORT_LOOPBACK_FILTER, L2_DROPS), NSIM_TRAP_DRIVER_EXCEPTION(FID_MISS, L2_DROPS), NSIM_TRAP_DROP(BLACKHOLE_ROUTE, L3_DROPS), NSIM_TRAP_EXCEPTION(TTL_ERROR, L3_EXCEPTIONS), NSIM_TRAP_DROP(TAIL_DROP, BUFFER_DROPS), NSIM_TRAP_DROP_EXT(INGRESS_FLOW_ACTION_DROP, ACL_DROPS, DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE), NSIM_TRAP_DROP_EXT(EGRESS_FLOW_ACTION_DROP, ACL_DROPS, DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE), NSIM_TRAP_CONTROL(IGMP_QUERY, MC_SNOOPING, MIRROR), NSIM_TRAP_CONTROL(IGMP_V1_REPORT, MC_SNOOPING, TRAP), }; #define NSIM_TRAP_L4_DATA_LEN 100 static struct sk_buff *nsim_dev_trap_skb_build(void) { int tot_len, data_len = NSIM_TRAP_L4_DATA_LEN; struct sk_buff *skb; struct udphdr *udph; struct ethhdr *eth; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb) return NULL; tot_len = sizeof(struct iphdr) + sizeof(struct udphdr) + data_len; skb_reset_mac_header(skb); eth = skb_put(skb, sizeof(struct ethhdr)); eth_random_addr(eth->h_dest); eth_random_addr(eth->h_source); eth->h_proto = htons(ETH_P_IP); skb->protocol = htons(ETH_P_IP); skb_set_network_header(skb, skb->len); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = IPPROTO_UDP; iph->saddr = in_aton("192.0.2.1"); iph->daddr = in_aton("198.51.100.1"); iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; iph->tot_len = htons(tot_len); iph->ttl = 100; iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb_set_transport_header(skb, skb->len); udph = skb_put_zero(skb, sizeof(struct udphdr) + data_len); get_random_bytes(&udph->source, sizeof(u16)); get_random_bytes(&udph->dest, sizeof(u16)); udph->len = htons(sizeof(struct udphdr) + data_len); return skb; } static void nsim_dev_trap_report(struct nsim_dev_port *nsim_dev_port) { struct nsim_dev *nsim_dev = nsim_dev_port->ns->nsim_dev; struct devlink *devlink = priv_to_devlink(nsim_dev); struct nsim_trap_data *nsim_trap_data; int i; nsim_trap_data = nsim_dev->trap_data; spin_lock(&nsim_trap_data->trap_lock); for (i = 0; i < ARRAY_SIZE(nsim_traps_arr); i++) { struct flow_action_cookie *fa_cookie = NULL; struct nsim_trap_item *nsim_trap_item; struct sk_buff *skb; bool has_fa_cookie; has_fa_cookie = nsim_traps_arr[i].metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE; nsim_trap_item = &nsim_trap_data->trap_items_arr[i]; if (nsim_trap_item->action == DEVLINK_TRAP_ACTION_DROP) continue; skb = nsim_dev_trap_skb_build(); if (!skb) continue; skb->dev = nsim_dev_port->ns->netdev; /* Trapped packets are usually passed to devlink in softIRQ, * but in this case they are generated in a workqueue. Disable * softIRQs to prevent lockdep from complaining about * "incosistent lock state". */ spin_lock_bh(&nsim_dev->fa_cookie_lock); fa_cookie = has_fa_cookie ? nsim_dev->fa_cookie : NULL; devlink_trap_report(devlink, skb, nsim_trap_item->trap_ctx, &nsim_dev_port->devlink_port, fa_cookie); spin_unlock_bh(&nsim_dev->fa_cookie_lock); consume_skb(skb); } spin_unlock(&nsim_trap_data->trap_lock); } #define NSIM_TRAP_REPORT_INTERVAL_MS 100 static void nsim_dev_trap_report_work(struct work_struct *work) { struct nsim_trap_data *nsim_trap_data; struct nsim_dev_port *nsim_dev_port; struct nsim_dev *nsim_dev; nsim_trap_data = container_of(work, struct nsim_trap_data, trap_report_dw.work); nsim_dev = nsim_trap_data->nsim_dev; /* For each running port and enabled packet trap, generate a UDP * packet with a random 5-tuple and report it. */ mutex_lock(&nsim_dev->port_list_lock); list_for_each_entry(nsim_dev_port, &nsim_dev->port_list, list) { if (!netif_running(nsim_dev_port->ns->netdev)) continue; nsim_dev_trap_report(nsim_dev_port); } mutex_unlock(&nsim_dev->port_list_lock); schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); } static int nsim_dev_traps_init(struct devlink *devlink) { size_t policers_count = ARRAY_SIZE(nsim_trap_policers_arr); struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_trap_data *nsim_trap_data; int err; nsim_trap_data = kzalloc(sizeof(*nsim_trap_data), GFP_KERNEL); if (!nsim_trap_data) return -ENOMEM; nsim_trap_data->trap_items_arr = kcalloc(ARRAY_SIZE(nsim_traps_arr), sizeof(struct nsim_trap_item), GFP_KERNEL); if (!nsim_trap_data->trap_items_arr) { err = -ENOMEM; goto err_trap_data_free; } nsim_trap_data->trap_policers_cnt_arr = kcalloc(policers_count, sizeof(u64), GFP_KERNEL); if (!nsim_trap_data->trap_policers_cnt_arr) { err = -ENOMEM; goto err_trap_items_free; } /* The lock is used to protect the action state of the registered * traps. The value is written by user and read in delayed work when * iterating over all the traps. */ spin_lock_init(&nsim_trap_data->trap_lock); nsim_trap_data->nsim_dev = nsim_dev; nsim_dev->trap_data = nsim_trap_data; err = devlink_trap_policers_register(devlink, nsim_trap_policers_arr, policers_count); if (err) goto err_trap_policers_cnt_free; err = devlink_trap_groups_register(devlink, nsim_trap_groups_arr, ARRAY_SIZE(nsim_trap_groups_arr)); if (err) goto err_trap_policers_unregister; err = devlink_traps_register(devlink, nsim_traps_arr, ARRAY_SIZE(nsim_traps_arr), NULL); if (err) goto err_trap_groups_unregister; INIT_DELAYED_WORK(&nsim_dev->trap_data->trap_report_dw, nsim_dev_trap_report_work); schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); return 0; err_trap_groups_unregister: devlink_trap_groups_unregister(devlink, nsim_trap_groups_arr, ARRAY_SIZE(nsim_trap_groups_arr)); err_trap_policers_unregister: devlink_trap_policers_unregister(devlink, nsim_trap_policers_arr, ARRAY_SIZE(nsim_trap_policers_arr)); err_trap_policers_cnt_free: kfree(nsim_trap_data->trap_policers_cnt_arr); err_trap_items_free: kfree(nsim_trap_data->trap_items_arr); err_trap_data_free: kfree(nsim_trap_data); return err; } static void nsim_dev_traps_exit(struct devlink *devlink) { struct nsim_dev *nsim_dev = devlink_priv(devlink); cancel_delayed_work_sync(&nsim_dev->trap_data->trap_report_dw); devlink_traps_unregister(devlink, nsim_traps_arr, ARRAY_SIZE(nsim_traps_arr)); devlink_trap_groups_unregister(devlink, nsim_trap_groups_arr, ARRAY_SIZE(nsim_trap_groups_arr)); devlink_trap_policers_unregister(devlink, nsim_trap_policers_arr, ARRAY_SIZE(nsim_trap_policers_arr)); kfree(nsim_dev->trap_data->trap_policers_cnt_arr); kfree(nsim_dev->trap_data->trap_items_arr); kfree(nsim_dev->trap_data); } static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack); static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev); static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_bus_dev *nsim_bus_dev; nsim_bus_dev = nsim_dev->nsim_bus_dev; if (!mutex_trylock(&nsim_bus_dev->nsim_bus_reload_lock)) return -EOPNOTSUPP; if (nsim_dev->dont_allow_reload) { /* For testing purposes, user set debugfs dont_allow_reload * value to true. So forbid it. */ NL_SET_ERR_MSG_MOD(extack, "User forbid the reload for testing purposes"); mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock); return -EOPNOTSUPP; } nsim_bus_dev->in_reload = true; nsim_dev_reload_destroy(nsim_dev); mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock); return 0; } static int nsim_dev_reload_up(struct devlink *devlink, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_bus_dev *nsim_bus_dev; int ret; nsim_bus_dev = nsim_dev->nsim_bus_dev; mutex_lock(&nsim_bus_dev->nsim_bus_reload_lock); nsim_bus_dev->in_reload = false; if (nsim_dev->fail_reload) { /* For testing purposes, user set debugfs fail_reload * value to true. Fail right away. */ NL_SET_ERR_MSG_MOD(extack, "User setup the reload to fail for testing purposes"); mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock); return -EINVAL; } *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); ret = nsim_dev_reload_create(nsim_dev, extack); mutex_unlock(&nsim_bus_dev->nsim_bus_reload_lock); return ret; } static int nsim_dev_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) { return devlink_info_driver_name_put(req, DRV_NAME); } #define NSIM_DEV_FLASH_SIZE 500000 #define NSIM_DEV_FLASH_CHUNK_SIZE 1000 #define NSIM_DEV_FLASH_CHUNK_TIME_MS 10 static int nsim_dev_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); int i; if ((params->overwrite_mask & ~nsim_dev->fw_update_overwrite_mask) != 0) return -EOPNOTSUPP; if (nsim_dev->fw_update_status) { devlink_flash_update_status_notify(devlink, "Preparing to flash", params->component, 0, 0); } for (i = 0; i < NSIM_DEV_FLASH_SIZE / NSIM_DEV_FLASH_CHUNK_SIZE; i++) { if (nsim_dev->fw_update_status) devlink_flash_update_status_notify(devlink, "Flashing", params->component, i * NSIM_DEV_FLASH_CHUNK_SIZE, NSIM_DEV_FLASH_SIZE); msleep(NSIM_DEV_FLASH_CHUNK_TIME_MS); } if (nsim_dev->fw_update_status) { devlink_flash_update_status_notify(devlink, "Flashing", params->component, NSIM_DEV_FLASH_SIZE, NSIM_DEV_FLASH_SIZE); devlink_flash_update_timeout_notify(devlink, "Flash select", params->component, 81); devlink_flash_update_status_notify(devlink, "Flashing done", params->component, 0, 0); } return 0; } static struct nsim_trap_item * nsim_dev_trap_item_lookup(struct nsim_dev *nsim_dev, u16 trap_id) { struct nsim_trap_data *nsim_trap_data = nsim_dev->trap_data; int i; for (i = 0; i < ARRAY_SIZE(nsim_traps_arr); i++) { if (nsim_traps_arr[i].id == trap_id) return &nsim_trap_data->trap_items_arr[i]; } return NULL; } static int nsim_dev_devlink_trap_init(struct devlink *devlink, const struct devlink_trap *trap, void *trap_ctx) { struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_trap_item *nsim_trap_item; nsim_trap_item = nsim_dev_trap_item_lookup(nsim_dev, trap->id); if (WARN_ON(!nsim_trap_item)) return -ENOENT; nsim_trap_item->trap_ctx = trap_ctx; nsim_trap_item->action = trap->init_action; return 0; } static int nsim_dev_devlink_trap_action_set(struct devlink *devlink, const struct devlink_trap *trap, enum devlink_trap_action action, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_trap_item *nsim_trap_item; nsim_trap_item = nsim_dev_trap_item_lookup(nsim_dev, trap->id); if (WARN_ON(!nsim_trap_item)) return -ENOENT; spin_lock(&nsim_dev->trap_data->trap_lock); nsim_trap_item->action = action; spin_unlock(&nsim_dev->trap_data->trap_lock); return 0; } static int nsim_dev_devlink_trap_group_set(struct devlink *devlink, const struct devlink_trap_group *group, const struct devlink_trap_policer *policer, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); if (nsim_dev->fail_trap_group_set) return -EINVAL; return 0; } static int nsim_dev_devlink_trap_policer_set(struct devlink *devlink, const struct devlink_trap_policer *policer, u64 rate, u64 burst, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); if (nsim_dev->fail_trap_policer_set) { NL_SET_ERR_MSG_MOD(extack, "User setup the operation to fail for testing purposes"); return -EINVAL; } return 0; } static int nsim_dev_devlink_trap_policer_counter_get(struct devlink *devlink, const struct devlink_trap_policer *policer, u64 *p_drops) { struct nsim_dev *nsim_dev = devlink_priv(devlink); u64 *cnt; if (nsim_dev->fail_trap_policer_counter_get) return -EINVAL; cnt = &nsim_dev->trap_data->trap_policers_cnt_arr[policer->id - 1]; *p_drops = (*cnt)++; return 0; } #define NSIM_LINK_SPEED_MAX 5000 /* Mbps */ #define NSIM_LINK_SPEED_UNIT 125000 /* 1 Mbps given in bytes/sec to avoid * u64 overflow during conversion from * bytes to bits. */ static int nsim_rate_bytes_to_units(char *name, u64 *rate, struct netlink_ext_ack *extack) { u64 val; u32 rem; val = div_u64_rem(*rate, NSIM_LINK_SPEED_UNIT, &rem); if (rem) { pr_err("%s rate value %lluBps not in link speed units of 1Mbps.\n", name, *rate); NL_SET_ERR_MSG_MOD(extack, "TX rate value not in link speed units of 1Mbps."); return -EINVAL; } if (val > NSIM_LINK_SPEED_MAX) { pr_err("%s rate value %lluMbps exceed link maximum speed 5000Mbps.\n", name, val); NL_SET_ERR_MSG_MOD(extack, "TX rate value exceed link maximum speed 5000Mbps."); return -EINVAL; } *rate = val; return 0; } static int nsim_leaf_tx_share_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack) { struct nsim_dev_port *nsim_dev_port = priv; struct nsim_bus_dev *nsim_bus_dev = nsim_dev_port->ns->nsim_bus_dev; int vf_id = nsim_dev_port_index_to_vf_index(nsim_dev_port->port_index); int err; err = nsim_rate_bytes_to_units("tx_share", &tx_share, extack); if (err) return err; nsim_bus_dev->vfconfigs[vf_id].min_tx_rate = tx_share; return 0; } static int nsim_leaf_tx_max_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack) { struct nsim_dev_port *nsim_dev_port = priv; struct nsim_bus_dev *nsim_bus_dev = nsim_dev_port->ns->nsim_bus_dev; int vf_id = nsim_dev_port_index_to_vf_index(nsim_dev_port->port_index); int err; err = nsim_rate_bytes_to_units("tx_max", &tx_max, extack); if (err) return err; nsim_bus_dev->vfconfigs[vf_id].max_tx_rate = tx_max; return 0; } struct nsim_rate_node { struct dentry *ddir; struct dentry *rate_parent; char *parent_name; u16 tx_share; u16 tx_max; }; static int nsim_node_tx_share_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv; int err; err = nsim_rate_bytes_to_units("tx_share", &tx_share, extack); if (err) return err; nsim_node->tx_share = tx_share; return 0; } static int nsim_node_tx_max_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv; int err; err = nsim_rate_bytes_to_units("tx_max", &tx_max, extack); if (err) return err; nsim_node->tx_max = tx_max; return 0; } static int nsim_rate_node_new(struct devlink_rate *node, void **priv, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(node->devlink); struct nsim_rate_node *nsim_node; if (!nsim_esw_mode_is_switchdev(nsim_dev)) { NL_SET_ERR_MSG_MOD(extack, "Node creation allowed only in switchdev mode."); return -EOPNOTSUPP; } nsim_node = kzalloc(sizeof(*nsim_node), GFP_KERNEL); if (!nsim_node) return -ENOMEM; nsim_node->ddir = debugfs_create_dir(node->name, nsim_dev->nodes_ddir); debugfs_create_u16("tx_share", 0400, nsim_node->ddir, &nsim_node->tx_share); debugfs_create_u16("tx_max", 0400, nsim_node->ddir, &nsim_node->tx_max); nsim_node->rate_parent = debugfs_create_file("rate_parent", 0400, nsim_node->ddir, &nsim_node->parent_name, &nsim_dev_rate_parent_fops); *priv = nsim_node; return 0; } static int nsim_rate_node_del(struct devlink_rate *node, void *priv, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv; debugfs_remove(nsim_node->rate_parent); debugfs_remove_recursive(nsim_node->ddir); kfree(nsim_node); return 0; } static int nsim_rate_leaf_parent_set(struct devlink_rate *child, struct devlink_rate *parent, void *priv_child, void *priv_parent, struct netlink_ext_ack *extack) { struct nsim_dev_port *nsim_dev_port = priv_child; if (parent) nsim_dev_port->parent_name = parent->name; else nsim_dev_port->parent_name = NULL; return 0; } static int nsim_rate_node_parent_set(struct devlink_rate *child, struct devlink_rate *parent, void *priv_child, void *priv_parent, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv_child; if (parent) nsim_node->parent_name = parent->name; else nsim_node->parent_name = NULL; return 0; } static int nsim_dev_devlink_trap_drop_counter_get(struct devlink *devlink, const struct devlink_trap *trap, u64 *p_drops) { struct nsim_dev *nsim_dev = devlink_priv(devlink); u64 *cnt; if (nsim_dev->fail_trap_drop_counter_get) return -EINVAL; cnt = &nsim_dev->trap_data->trap_pkt_cnt; *p_drops = (*cnt)++; return 0; } static const struct devlink_ops nsim_dev_devlink_ops = { .eswitch_mode_set = nsim_devlink_eswitch_mode_set, .eswitch_mode_get = nsim_devlink_eswitch_mode_get, .supported_flash_update_params = DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT | DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK, .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = nsim_dev_reload_down, .reload_up = nsim_dev_reload_up, .info_get = nsim_dev_info_get, .flash_update = nsim_dev_flash_update, .trap_init = nsim_dev_devlink_trap_init, .trap_action_set = nsim_dev_devlink_trap_action_set, .trap_group_set = nsim_dev_devlink_trap_group_set, .trap_policer_set = nsim_dev_devlink_trap_policer_set, .trap_policer_counter_get = nsim_dev_devlink_trap_policer_counter_get, .rate_leaf_tx_share_set = nsim_leaf_tx_share_set, .rate_leaf_tx_max_set = nsim_leaf_tx_max_set, .rate_node_tx_share_set = nsim_node_tx_share_set, .rate_node_tx_max_set = nsim_node_tx_max_set, .rate_node_new = nsim_rate_node_new, .rate_node_del = nsim_rate_node_del, .rate_leaf_parent_set = nsim_rate_leaf_parent_set, .rate_node_parent_set = nsim_rate_node_parent_set, .trap_drop_counter_get = nsim_dev_devlink_trap_drop_counter_get, }; #define NSIM_DEV_MAX_MACS_DEFAULT 32 #define NSIM_DEV_TEST1_DEFAULT true static int __nsim_dev_port_add(struct nsim_dev *nsim_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct nsim_bus_dev *nsim_bus_dev = nsim_dev->nsim_bus_dev; struct devlink_port_attrs attrs = {}; struct nsim_dev_port *nsim_dev_port; struct devlink_port *devlink_port; int err; if (type == NSIM_DEV_PORT_TYPE_VF && !nsim_bus_dev->num_vfs) return -EINVAL; nsim_dev_port = kzalloc(sizeof(*nsim_dev_port), GFP_KERNEL); if (!nsim_dev_port) return -ENOMEM; nsim_dev_port->port_index = nsim_dev_port_index(type, port_index); nsim_dev_port->port_type = type; devlink_port = &nsim_dev_port->devlink_port; if (nsim_dev_port_is_pf(nsim_dev_port)) { attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = port_index + 1; } else { attrs.flavour = DEVLINK_PORT_FLAVOUR_PCI_VF; attrs.pci_vf.pf = 0; attrs.pci_vf.vf = port_index; } memcpy(attrs.switch_id.id, nsim_dev->switch_id.id, nsim_dev->switch_id.id_len); attrs.switch_id.id_len = nsim_dev->switch_id.id_len; devlink_port_attrs_set(devlink_port, &attrs); err = devlink_port_register(priv_to_devlink(nsim_dev), devlink_port, nsim_dev_port->port_index); if (err) goto err_port_free; err = nsim_dev_port_debugfs_init(nsim_dev, nsim_dev_port); if (err) goto err_dl_port_unregister; nsim_dev_port->ns = nsim_create(nsim_dev, nsim_dev_port); if (IS_ERR(nsim_dev_port->ns)) { err = PTR_ERR(nsim_dev_port->ns); goto err_port_debugfs_exit; } if (nsim_dev_port_is_vf(nsim_dev_port)) { err = devlink_rate_leaf_create(&nsim_dev_port->devlink_port, nsim_dev_port); if (err) goto err_nsim_destroy; } devlink_port_type_eth_set(devlink_port, nsim_dev_port->ns->netdev); list_add(&nsim_dev_port->list, &nsim_dev->port_list); return 0; err_nsim_destroy: nsim_destroy(nsim_dev_port->ns); err_port_debugfs_exit: nsim_dev_port_debugfs_exit(nsim_dev_port); err_dl_port_unregister: devlink_port_unregister(devlink_port); err_port_free: kfree(nsim_dev_port); return err; } static void __nsim_dev_port_del(struct nsim_dev_port *nsim_dev_port) { struct devlink_port *devlink_port = &nsim_dev_port->devlink_port; list_del(&nsim_dev_port->list); if (nsim_dev_port_is_vf(nsim_dev_port)) devlink_rate_leaf_destroy(&nsim_dev_port->devlink_port); devlink_port_type_clear(devlink_port); nsim_destroy(nsim_dev_port->ns); nsim_dev_port_debugfs_exit(nsim_dev_port); devlink_port_unregister(devlink_port); kfree(nsim_dev_port); } static void nsim_dev_port_del_all(struct nsim_dev *nsim_dev) { struct nsim_dev_port *nsim_dev_port, *tmp; mutex_lock(&nsim_dev->port_list_lock); list_for_each_entry_safe(nsim_dev_port, tmp, &nsim_dev->port_list, list) __nsim_dev_port_del(nsim_dev_port); mutex_unlock(&nsim_dev->port_list_lock); } static int nsim_dev_port_add_all(struct nsim_dev *nsim_dev, unsigned int port_count) { int i, err; for (i = 0; i < port_count; i++) { err = __nsim_dev_port_add(nsim_dev, NSIM_DEV_PORT_TYPE_PF, i); if (err) goto err_port_del_all; } return 0; err_port_del_all: nsim_dev_port_del_all(nsim_dev); return err; } static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack) { struct nsim_bus_dev *nsim_bus_dev = nsim_dev->nsim_bus_dev; struct devlink *devlink; int err; devlink = priv_to_devlink(nsim_dev); nsim_dev = devlink_priv(devlink); INIT_LIST_HEAD(&nsim_dev->port_list); mutex_init(&nsim_dev->port_list_lock); nsim_dev->fw_update_status = true; nsim_dev->fw_update_overwrite_mask = 0; nsim_devlink_param_load_driverinit_values(devlink); err = nsim_dev_dummy_region_init(nsim_dev, devlink); if (err) return err; err = nsim_dev_traps_init(devlink); if (err) goto err_dummy_region_exit; nsim_dev->fib_data = nsim_fib_create(devlink, extack); if (IS_ERR(nsim_dev->fib_data)) { err = PTR_ERR(nsim_dev->fib_data); goto err_traps_exit; } err = nsim_dev_health_init(nsim_dev, devlink); if (err) goto err_fib_destroy; err = nsim_dev_psample_init(nsim_dev); if (err) goto err_health_exit; err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); if (err) goto err_psample_exit; nsim_dev->take_snapshot = debugfs_create_file("take_snapshot", 0200, nsim_dev->ddir, nsim_dev, &nsim_dev_take_snapshot_fops); return 0; err_psample_exit: nsim_dev_psample_exit(nsim_dev); err_health_exit: nsim_dev_health_exit(nsim_dev); err_fib_destroy: nsim_fib_destroy(devlink, nsim_dev->fib_data); err_traps_exit: nsim_dev_traps_exit(devlink); err_dummy_region_exit: nsim_dev_dummy_region_exit(nsim_dev); return err; } int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev) { struct nsim_dev *nsim_dev; struct devlink *devlink; int err; devlink = devlink_alloc_ns(&nsim_dev_devlink_ops, sizeof(*nsim_dev), nsim_bus_dev->initial_net, &nsim_bus_dev->dev); if (!devlink) return -ENOMEM; nsim_dev = devlink_priv(devlink); nsim_dev->nsim_bus_dev = nsim_bus_dev; nsim_dev->switch_id.id_len = sizeof(nsim_dev->switch_id.id); get_random_bytes(nsim_dev->switch_id.id, nsim_dev->switch_id.id_len); INIT_LIST_HEAD(&nsim_dev->port_list); mutex_init(&nsim_dev->port_list_lock); nsim_dev->fw_update_status = true; nsim_dev->fw_update_overwrite_mask = 0; nsim_dev->max_macs = NSIM_DEV_MAX_MACS_DEFAULT; nsim_dev->test1 = NSIM_DEV_TEST1_DEFAULT; spin_lock_init(&nsim_dev->fa_cookie_lock); dev_set_drvdata(&nsim_bus_dev->dev, nsim_dev); err = nsim_dev_resources_register(devlink); if (err) goto err_devlink_free; err = devlink_register(devlink); if (err) goto err_resources_unregister; err = devlink_params_register(devlink, nsim_devlink_params, ARRAY_SIZE(nsim_devlink_params)); if (err) goto err_dl_unregister; nsim_devlink_set_params_init_values(nsim_dev, devlink); err = nsim_dev_dummy_region_init(nsim_dev, devlink); if (err) goto err_params_unregister; err = nsim_dev_traps_init(devlink); if (err) goto err_dummy_region_exit; err = nsim_dev_debugfs_init(nsim_dev); if (err) goto err_traps_exit; nsim_dev->fib_data = nsim_fib_create(devlink, NULL); if (IS_ERR(nsim_dev->fib_data)) { err = PTR_ERR(nsim_dev->fib_data); goto err_debugfs_exit; } err = nsim_dev_health_init(nsim_dev, devlink); if (err) goto err_fib_destroy; err = nsim_bpf_dev_init(nsim_dev); if (err) goto err_health_exit; err = nsim_dev_psample_init(nsim_dev); if (err) goto err_bpf_dev_exit; err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); if (err) goto err_psample_exit; devlink_params_publish(devlink); devlink_reload_enable(devlink); nsim_dev->esw_mode = DEVLINK_ESWITCH_MODE_LEGACY; return 0; err_psample_exit: nsim_dev_psample_exit(nsim_dev); err_bpf_dev_exit: nsim_bpf_dev_exit(nsim_dev); err_health_exit: nsim_dev_health_exit(nsim_dev); err_fib_destroy: nsim_fib_destroy(devlink, nsim_dev->fib_data); err_debugfs_exit: nsim_dev_debugfs_exit(nsim_dev); err_traps_exit: nsim_dev_traps_exit(devlink); err_dummy_region_exit: nsim_dev_dummy_region_exit(nsim_dev); err_params_unregister: devlink_params_unregister(devlink, nsim_devlink_params, ARRAY_SIZE(nsim_devlink_params)); err_dl_unregister: devlink_unregister(devlink); err_resources_unregister: devlink_resources_unregister(devlink, NULL); err_devlink_free: devlink_free(devlink); return err; } static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev) { struct devlink *devlink = priv_to_devlink(nsim_dev); if (devlink_is_reload_failed(devlink)) return; debugfs_remove(nsim_dev->take_snapshot); mutex_lock(&nsim_dev->nsim_bus_dev->vfs_lock); if (nsim_dev->nsim_bus_dev->num_vfs) nsim_bus_dev_vfs_disable(nsim_dev->nsim_bus_dev); mutex_unlock(&nsim_dev->nsim_bus_dev->vfs_lock); nsim_dev_port_del_all(nsim_dev); nsim_dev_psample_exit(nsim_dev); nsim_dev_health_exit(nsim_dev); nsim_fib_destroy(devlink, nsim_dev->fib_data); nsim_dev_traps_exit(devlink); nsim_dev_dummy_region_exit(nsim_dev); mutex_destroy(&nsim_dev->port_list_lock); } void nsim_dev_remove(struct nsim_bus_dev *nsim_bus_dev) { struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); struct devlink *devlink = priv_to_devlink(nsim_dev); devlink_reload_disable(devlink); nsim_dev_reload_destroy(nsim_dev); nsim_bpf_dev_exit(nsim_dev); nsim_dev_debugfs_exit(nsim_dev); devlink_params_unregister(devlink, nsim_devlink_params, ARRAY_SIZE(nsim_devlink_params)); devlink_unregister(devlink); devlink_resources_unregister(devlink, NULL); devlink_free(devlink); } static struct nsim_dev_port * __nsim_dev_port_lookup(struct nsim_dev *nsim_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct nsim_dev_port *nsim_dev_port; port_index = nsim_dev_port_index(type, port_index); list_for_each_entry(nsim_dev_port, &nsim_dev->port_list, list) if (nsim_dev_port->port_index == port_index) return nsim_dev_port; return NULL; } int nsim_dev_port_add(struct nsim_bus_dev *nsim_bus_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); int err; mutex_lock(&nsim_dev->port_list_lock); if (__nsim_dev_port_lookup(nsim_dev, type, port_index)) err = -EEXIST; else err = __nsim_dev_port_add(nsim_dev, type, port_index); mutex_unlock(&nsim_dev->port_list_lock); return err; } int nsim_dev_port_del(struct nsim_bus_dev *nsim_bus_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); struct nsim_dev_port *nsim_dev_port; int err = 0; mutex_lock(&nsim_dev->port_list_lock); nsim_dev_port = __nsim_dev_port_lookup(nsim_dev, type, port_index); if (!nsim_dev_port) err = -ENOENT; else __nsim_dev_port_del(nsim_dev_port); mutex_unlock(&nsim_dev->port_list_lock); return err; } int nsim_dev_init(void) { nsim_dev_ddir = debugfs_create_dir(DRV_NAME, NULL); return PTR_ERR_OR_ZERO(nsim_dev_ddir); } void nsim_dev_exit(void) { debugfs_remove_recursive(nsim_dev_ddir); }
254 141 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _DELAYED_CALL_H #define _DELAYED_CALL_H /* * Poor man's closures; I wish we could've done them sanely polymorphic, * but... */ struct delayed_call { void (*fn)(void *); void *arg; }; #define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL} /* I really wish we had closures with sane typechecking... */ static inline void set_delayed_call(struct delayed_call *call, void (*fn)(void *), void *arg) { call->fn = fn; call->arg = arg; } static inline void do_delayed_call(struct delayed_call *call) { if (call->fn) call->fn(call->arg); } static inline void clear_delayed_call(struct delayed_call *call) { call->fn = NULL; } #endif
1651 197 217 28 3988 9 3989 3991 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/export.h> #include <linux/fault-inject-usercopy.h> #include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <asm/word-at-a-time.h> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define IS_UNALIGNED(src, dst) 0 #else #define IS_UNALIGNED(src, dst) \ (((long) dst | (long) src) & (sizeof(long) - 1)) #endif /* * Do a strncpy, return length of string without final '\0'. * 'count' is the user-supplied count (return 'count' if we * hit it), 'max' is the address space maximum (and we return * -EFAULT if we hit it). */ static inline long do_strncpy_from_user(char *dst, const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long res = 0; if (IS_UNALIGNED(src, dst)) goto byte_at_a_time; while (max >= sizeof(unsigned long)) { unsigned long c, data, mask; /* Fall back to byte-at-a-time if we get a page fault */ unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); /* * Note that we mask out the bytes following the NUL. This is * important to do because string oblivious code may read past * the NUL. For those routines, we don't want to give them * potentially random bytes after the NUL in `src`. * * One example of such code is BPF map keys. BPF treats map keys * as an opaque set of bytes. Without the post-NUL mask, any BPF * maps keyed by strings returned from strncpy_from_user() may * have multiple entries for semantically identical strings. */ if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); mask = zero_bytemask(data); *(unsigned long *)(dst+res) = c & mask; return res + find_zero(data); } *(unsigned long *)(dst+res) = c; res += sizeof(unsigned long); max -= sizeof(unsigned long); } byte_at_a_time: while (max) { char c; unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; res++; max--; } /* * Uhhuh. We hit 'max'. But was that the user-specified maximum * too? If so, that's ok - we got as much as the user asked for. */ if (res >= count) return res; /* * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ efault: return -EFAULT; } /** * strncpy_from_user: - Copy a NUL terminated string from userspace. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @src: Source address, in user space. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from userspace to kernel space. * * On success, returns the length of the string (not including the trailing * NUL). * * If access to userspace fails, returns -EFAULT (some data may have been * copied). * * If @count is smaller than the length of the string, copies @count bytes * and returns @count. */ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; might_fault(); if (should_fail_usercopy()) return -EFAULT; if (unlikely(count <= 0)) return 0; max_addr = user_addr_max(); src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; /* * Truncate 'max' to the user-specified limit, so that * we only have one limit we need to check in the loop */ if (max > count) max = count; kasan_check_write(dst, count); check_object_size(dst, count, false); if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); user_read_access_end(); return retval; } } return -EFAULT; } EXPORT_SYMBOL(strncpy_from_user);
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 // SPDX-License-Identifier: GPL-2.0-only /* * ppp_deflate.c - interface the zlib procedures for Deflate compression * and decompression (as used by gzip) to the PPP code. * * Copyright 1994-1998 Paul Mackerras. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/string.h> #include <linux/ppp_defs.h> #include <linux/ppp-comp.h> #include <linux/zlib.h> #include <asm/unaligned.h> /* * State for a Deflate (de)compressor. */ struct ppp_deflate_state { int seqno; int w_size; int unit; int mru; int debug; z_stream strm; struct compstat stats; }; #define DEFLATE_OVHD 2 /* Deflate overhead/packet */ static void *z_comp_alloc(unsigned char *options, int opt_len); static void *z_decomp_alloc(unsigned char *options, int opt_len); static void z_comp_free(void *state); static void z_decomp_free(void *state); static int z_comp_init(void *state, unsigned char *options, int opt_len, int unit, int hdrlen, int debug); static int z_decomp_init(void *state, unsigned char *options, int opt_len, int unit, int hdrlen, int mru, int debug); static int z_compress(void *state, unsigned char *rptr, unsigned char *obuf, int isize, int osize); static void z_incomp(void *state, unsigned char *ibuf, int icnt); static int z_decompress(void *state, unsigned char *ibuf, int isize, unsigned char *obuf, int osize); static void z_comp_reset(void *state); static void z_decomp_reset(void *state); static void z_comp_stats(void *state, struct compstat *stats); /** * z_comp_free - free the memory used by a compressor * @arg: pointer to the private state for the compressor. */ static void z_comp_free(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (state) { zlib_deflateEnd(&state->strm); vfree(state->strm.workspace); kfree(state); } } /** * z_comp_alloc - allocate space for a compressor. * @options: pointer to CCP option data * @opt_len: length of the CCP option at @options. * * The @options pointer points to the a buffer containing the * CCP option data for the compression being negotiated. It is * formatted according to RFC1979, and describes the window * size that the peer is requesting that we use in compressing * data to be sent to it. * * Returns the pointer to the private state for the compressor, * or NULL if we could not allocate enough memory. */ static void *z_comp_alloc(unsigned char *options, int opt_len) { struct ppp_deflate_state *state; int w_size; if (opt_len != CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || options[3] != DEFLATE_CHK_SEQUENCE) return NULL; w_size = DEFLATE_SIZE(options[2]); if (w_size < DEFLATE_MIN_SIZE || w_size > DEFLATE_MAX_SIZE) return NULL; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) return NULL; state->strm.next_in = NULL; state->w_size = w_size; state->strm.workspace = vmalloc(zlib_deflate_workspacesize(-w_size, 8)); if (state->strm.workspace == NULL) goto out_free; if (zlib_deflateInit2(&state->strm, Z_DEFAULT_COMPRESSION, DEFLATE_METHOD_VAL, -w_size, 8, Z_DEFAULT_STRATEGY) != Z_OK) goto out_free; return (void *) state; out_free: z_comp_free(state); return NULL; } /** * z_comp_init - initialize a previously-allocated compressor. * @arg: pointer to the private state for the compressor * @options: pointer to the CCP option data describing the * compression that was negotiated with the peer * @opt_len: length of the CCP option data at @options * @unit: PPP unit number for diagnostic messages * @hdrlen: ignored (present for backwards compatibility) * @debug: debug flag; if non-zero, debug messages are printed. * * The CCP options described by @options must match the options * specified when the compressor was allocated. The compressor * history is reset. Returns 0 for failure (CCP options don't * match) or 1 for success. */ static int z_comp_init(void *arg, unsigned char *options, int opt_len, int unit, int hdrlen, int debug) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (opt_len < CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || DEFLATE_SIZE(options[2]) != state->w_size || options[3] != DEFLATE_CHK_SEQUENCE) return 0; state->seqno = 0; state->unit = unit; state->debug = debug; zlib_deflateReset(&state->strm); return 1; } /** * z_comp_reset - reset a previously-allocated compressor. * @arg: pointer to private state for the compressor. * * This clears the history for the compressor and makes it * ready to start emitting a new compressed stream. */ static void z_comp_reset(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; state->seqno = 0; zlib_deflateReset(&state->strm); } /** * z_compress - compress a PPP packet with Deflate compression. * @arg: pointer to private state for the compressor * @rptr: uncompressed packet (input) * @obuf: compressed packet (output) * @isize: size of uncompressed packet * @osize: space available at @obuf * * Returns the length of the compressed packet, or 0 if the * packet is incompressible. */ static int z_compress(void *arg, unsigned char *rptr, unsigned char *obuf, int isize, int osize) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int r, proto, off, olen, oavail; unsigned char *wptr; /* * Check that the protocol is in the range we handle. */ proto = PPP_PROTOCOL(rptr); if (proto > 0x3fff || proto == 0xfd || proto == 0xfb) return 0; /* Don't generate compressed packets which are larger than the uncompressed packet. */ if (osize > isize) osize = isize; wptr = obuf; /* * Copy over the PPP header and store the 2-byte sequence number. */ wptr[0] = PPP_ADDRESS(rptr); wptr[1] = PPP_CONTROL(rptr); put_unaligned_be16(PPP_COMP, wptr + 2); wptr += PPP_HDRLEN; put_unaligned_be16(state->seqno, wptr); wptr += DEFLATE_OVHD; olen = PPP_HDRLEN + DEFLATE_OVHD; state->strm.next_out = wptr; state->strm.avail_out = oavail = osize - olen; ++state->seqno; off = (proto > 0xff) ? 2 : 3; /* skip 1st proto byte if 0 */ rptr += off; state->strm.next_in = rptr; state->strm.avail_in = (isize - off); for (;;) { r = zlib_deflate(&state->strm, Z_PACKET_FLUSH); if (r != Z_OK) { if (state->debug) printk(KERN_ERR "z_compress: deflate returned %d\n", r); break; } if (state->strm.avail_out == 0) { olen += oavail; state->strm.next_out = NULL; state->strm.avail_out = oavail = 1000000; } else { break; /* all done */ } } olen += oavail - state->strm.avail_out; /* * See if we managed to reduce the size of the packet. */ if (olen < isize && olen <= osize) { state->stats.comp_bytes += olen; state->stats.comp_packets++; } else { state->stats.inc_bytes += isize; state->stats.inc_packets++; olen = 0; } state->stats.unc_bytes += isize; state->stats.unc_packets++; return olen; } /** * z_comp_stats - return compression statistics for a compressor * or decompressor. * @arg: pointer to private space for the (de)compressor * @stats: pointer to a struct compstat to receive the result. */ static void z_comp_stats(void *arg, struct compstat *stats) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; *stats = state->stats; } /** * z_decomp_free - Free the memory used by a decompressor. * @arg: pointer to private space for the decompressor. */ static void z_decomp_free(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (state) { vfree(state->strm.workspace); kfree(state); } } /** * z_decomp_alloc - allocate space for a decompressor. * @options: pointer to CCP option data * @opt_len: length of the CCP option at @options. * * The @options pointer points to the a buffer containing the * CCP option data for the compression being negotiated. It is * formatted according to RFC1979, and describes the window * size that we are requesting the peer to use in compressing * data to be sent to us. * * Returns the pointer to the private state for the decompressor, * or NULL if we could not allocate enough memory. */ static void *z_decomp_alloc(unsigned char *options, int opt_len) { struct ppp_deflate_state *state; int w_size; if (opt_len != CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || options[3] != DEFLATE_CHK_SEQUENCE) return NULL; w_size = DEFLATE_SIZE(options[2]); if (w_size < DEFLATE_MIN_SIZE || w_size > DEFLATE_MAX_SIZE) return NULL; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) return NULL; state->w_size = w_size; state->strm.next_out = NULL; state->strm.workspace = vmalloc(zlib_inflate_workspacesize()); if (state->strm.workspace == NULL) goto out_free; if (zlib_inflateInit2(&state->strm, -w_size) != Z_OK) goto out_free; return (void *) state; out_free: z_decomp_free(state); return NULL; } /** * z_decomp_init - initialize a previously-allocated decompressor. * @arg: pointer to the private state for the decompressor * @options: pointer to the CCP option data describing the * compression that was negotiated with the peer * @opt_len: length of the CCP option data at @options * @unit: PPP unit number for diagnostic messages * @hdrlen: ignored (present for backwards compatibility) * @mru: maximum length of decompressed packets * @debug: debug flag; if non-zero, debug messages are printed. * * The CCP options described by @options must match the options * specified when the decompressor was allocated. The decompressor * history is reset. Returns 0 for failure (CCP options don't * match) or 1 for success. */ static int z_decomp_init(void *arg, unsigned char *options, int opt_len, int unit, int hdrlen, int mru, int debug) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (opt_len < CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || DEFLATE_SIZE(options[2]) != state->w_size || options[3] != DEFLATE_CHK_SEQUENCE) return 0; state->seqno = 0; state->unit = unit; state->debug = debug; state->mru = mru; zlib_inflateReset(&state->strm); return 1; } /** * z_decomp_reset - reset a previously-allocated decompressor. * @arg: pointer to private state for the decompressor. * * This clears the history for the decompressor and makes it * ready to receive a new compressed stream. */ static void z_decomp_reset(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; state->seqno = 0; zlib_inflateReset(&state->strm); } /** * z_decompress - decompress a Deflate-compressed packet. * @arg: pointer to private state for the decompressor * @ibuf: pointer to input (compressed) packet data * @isize: length of input packet * @obuf: pointer to space for output (decompressed) packet * @osize: amount of space available at @obuf * * Because of patent problems, we return DECOMP_ERROR for errors * found by inspecting the input data and for system problems, but * DECOMP_FATALERROR for any errors which could possibly be said to * be being detected "after" decompression. For DECOMP_ERROR, * we can issue a CCP reset-request; for DECOMP_FATALERROR, we may be * infringing a patent of Motorola's if we do, so we take CCP down * instead. * * Given that the frame has the correct sequence number and a good FCS, * errors such as invalid codes in the input most likely indicate a * bug, so we return DECOMP_FATALERROR for them in order to turn off * compression, even though they are detected by inspecting the input. */ static int z_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, int osize) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int olen, seq, r; int decode_proto, overflow; unsigned char overflow_buf[1]; if (isize <= PPP_HDRLEN + DEFLATE_OVHD) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: short pkt (%d)\n", state->unit, isize); return DECOMP_ERROR; } /* Check the sequence number. */ seq = get_unaligned_be16(ibuf + PPP_HDRLEN); if (seq != (state->seqno & 0xffff)) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: bad seq # %d, expected %d\n", state->unit, seq, state->seqno & 0xffff); return DECOMP_ERROR; } ++state->seqno; /* * Fill in the first part of the PPP header. The protocol field * comes from the decompressed data. */ obuf[0] = PPP_ADDRESS(ibuf); obuf[1] = PPP_CONTROL(ibuf); obuf[2] = 0; /* * Set up to call inflate. We set avail_out to 1 initially so we can * look at the first byte of the output and decide whether we have * a 1-byte or 2-byte protocol field. */ state->strm.next_in = ibuf + PPP_HDRLEN + DEFLATE_OVHD; state->strm.avail_in = isize - (PPP_HDRLEN + DEFLATE_OVHD); state->strm.next_out = obuf + 3; state->strm.avail_out = 1; decode_proto = 1; overflow = 0; /* * Call inflate, supplying more input or output as needed. */ for (;;) { r = zlib_inflate(&state->strm, Z_PACKET_FLUSH); if (r != Z_OK) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: inflate returned %d (%s)\n", state->unit, r, (state->strm.msg? state->strm.msg: "")); return DECOMP_FATALERROR; } if (state->strm.avail_out != 0) break; /* all done */ if (decode_proto) { state->strm.avail_out = osize - PPP_HDRLEN; if ((obuf[3] & 1) == 0) { /* 2-byte protocol field */ obuf[2] = obuf[3]; --state->strm.next_out; ++state->strm.avail_out; } decode_proto = 0; } else if (!overflow) { /* * We've filled up the output buffer; the only way to * find out whether inflate has any more characters * left is to give it another byte of output space. */ state->strm.next_out = overflow_buf; state->strm.avail_out = 1; overflow = 1; } else { if (state->debug) printk(KERN_DEBUG "z_decompress%d: ran out of mru\n", state->unit); return DECOMP_FATALERROR; } } if (decode_proto) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: didn't get proto\n", state->unit); return DECOMP_ERROR; } olen = osize + overflow - state->strm.avail_out; state->stats.unc_bytes += olen; state->stats.unc_packets++; state->stats.comp_bytes += isize; state->stats.comp_packets++; return olen; } /** * z_incomp - add incompressible input data to the history. * @arg: pointer to private state for the decompressor * @ibuf: pointer to input packet data * @icnt: length of input data. */ static void z_incomp(void *arg, unsigned char *ibuf, int icnt) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int proto, r; /* * Check that the protocol is one we handle. */ proto = PPP_PROTOCOL(ibuf); if (proto > 0x3fff || proto == 0xfd || proto == 0xfb) return; ++state->seqno; /* * We start at the either the 1st or 2nd byte of the protocol field, * depending on whether the protocol value is compressible. */ state->strm.next_in = ibuf + 3; state->strm.avail_in = icnt - 3; if (proto > 0xff) { --state->strm.next_in; ++state->strm.avail_in; } r = zlib_inflateIncomp(&state->strm); if (r != Z_OK) { /* gak! */ if (state->debug) { printk(KERN_DEBUG "z_incomp%d: inflateIncomp returned %d (%s)\n", state->unit, r, (state->strm.msg? state->strm.msg: "")); } return; } /* * Update stats. */ state->stats.inc_bytes += icnt; state->stats.inc_packets++; state->stats.unc_bytes += icnt; state->stats.unc_packets++; } /************************************************************* * Module interface table *************************************************************/ /* These are in ppp_generic.c */ extern int ppp_register_compressor (struct compressor *cp); extern void ppp_unregister_compressor (struct compressor *cp); /* * Procedures exported to if_ppp.c. */ static struct compressor ppp_deflate = { .compress_proto = CI_DEFLATE, .comp_alloc = z_comp_alloc, .comp_free = z_comp_free, .comp_init = z_comp_init, .comp_reset = z_comp_reset, .compress = z_compress, .comp_stat = z_comp_stats, .decomp_alloc = z_decomp_alloc, .decomp_free = z_decomp_free, .decomp_init = z_decomp_init, .decomp_reset = z_decomp_reset, .decompress = z_decompress, .incomp = z_incomp, .decomp_stat = z_comp_stats, .owner = THIS_MODULE }; static struct compressor ppp_deflate_draft = { .compress_proto = CI_DEFLATE_DRAFT, .comp_alloc = z_comp_alloc, .comp_free = z_comp_free, .comp_init = z_comp_init, .comp_reset = z_comp_reset, .compress = z_compress, .comp_stat = z_comp_stats, .decomp_alloc = z_decomp_alloc, .decomp_free = z_decomp_free, .decomp_init = z_decomp_init, .decomp_reset = z_decomp_reset, .decompress = z_decompress, .incomp = z_incomp, .decomp_stat = z_comp_stats, .owner = THIS_MODULE }; static int __init deflate_init(void) { int rc; rc = ppp_register_compressor(&ppp_deflate); if (rc) return rc; rc = ppp_register_compressor(&ppp_deflate_draft); if (rc) { ppp_unregister_compressor(&ppp_deflate); return rc; } pr_info("PPP Deflate Compression module registered\n"); return 0; } static void __exit deflate_cleanup(void) { ppp_unregister_compressor(&ppp_deflate); ppp_unregister_compressor(&ppp_deflate_draft); } module_init(deflate_init); module_exit(deflate_cleanup); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("ppp-compress-" __stringify(CI_DEFLATE)); MODULE_ALIAS("ppp-compress-" __stringify(CI_DEFLATE_DRAFT));
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, struct timespec64 *time, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct user_namespace *mnt_userns, struct inode *inode, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed);
345 321 53 39 53 231 18 544 13 13 22 14 13 140 140 140 601 232 114 304 101 252 253 305 177 141 228 17 17 17 17 17 8 9 4 11 5 14 2 15 7 22 23 3 17 1 4 1 1 4 16 4 1 21 4 17 1 15 10 2 4 4 2 3 3 34 34 3 19 30 2 1 2 1 20 4 182 2 3 2 171 8 4 8 4 16 12 25 15 18 5 45 21 49 74 82 13 151 11 2 8 1 151 4 17 10 5 2 164 16 147 3 23 6 3 20 20 18 18 15 18 15 20 179 111 91 112 112 17 99 5 31 72 5 78 7 76 90 6 88 2 90 65 27 27 3 3 27 27 4 89 89 68 49 275 194 140 117 50 88 11 903 121 895 416 836 162 199 190 12 60 90 47 47 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. */ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -EINVAL; e_rpf: return -EXDEV; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; if (inet_lookup_ifaddr_rcu(net, src)) return -EINVAL; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. */ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return -ENODEV; *colon = ':'; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } rcu_read_unlock(); if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_unlock(); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_tos = rtm->rtm_tos; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); return -EINVAL; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto errout; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto errout; } err = fib_table_delete(net, tb, &cfg, extack); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto errout; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; ASSERT_RTNL(); if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); filter->dev = __dev_get_by_index(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err; if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) return err; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) return skb->len; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) return skb->len; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); return -ENOENT; } rcu_read_lock(); err = fib_table_dump(tb, skb, cb, &filter); rcu_read_unlock(); return skb->len ? : err; } s_h = cb->args[0]; s_e = cb->args[1]; rcu_read_lock(); for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) { if (likely(skb->len)) goto out; goto out_err; } dumped = 1; next: e++; } } out: err = skb->len; out_err: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. */ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_tos = frn->fl_tos & IPTOS_RT_MASK, .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = (struct fib_result_nl *) nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. */ fib_disable_ip(dev, event, true); } else { rt_cache_flush(dev_net(dev)); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. */ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif rtnl_unlock(); kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: ip_fib_net_exit(net); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); ip_fib_net_exit(net); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0); }
5 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #ifndef _WG_QUEUEING_H #define _WG_QUEUEING_H #include "peer.h" #include <linux/types.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip_tunnels.h> struct wg_device; struct wg_peer; struct multicore_worker; struct crypt_queue; struct prev_queue; struct sk_buff; /* queueing.c APIs: */ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, unsigned int len); void wg_packet_queue_free(struct crypt_queue *queue, bool purge); struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); /* receive.c APIs: */ void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb); void wg_packet_handshake_receive_worker(struct work_struct *work); /* NAPI poll function: */ int wg_packet_rx_poll(struct napi_struct *napi, int budget); /* Workqueue worker: */ void wg_packet_decrypt_worker(struct work_struct *work); /* send.c APIs: */ void wg_packet_send_queued_handshake_initiation(struct wg_peer *peer, bool is_retry); void wg_packet_send_handshake_response(struct wg_peer *peer); void wg_packet_send_handshake_cookie(struct wg_device *wg, struct sk_buff *initiating_skb, __le32 sender_index); void wg_packet_send_keepalive(struct wg_peer *peer); void wg_packet_purge_staged_packets(struct wg_peer *peer); void wg_packet_send_staged_packets(struct wg_peer *peer); /* Workqueue workers: */ void wg_packet_handshake_send_worker(struct work_struct *work); void wg_packet_tx_worker(struct work_struct *work); void wg_packet_encrypt_worker(struct work_struct *work); enum packet_state { PACKET_STATE_UNCRYPTED, PACKET_STATE_CRYPTED, PACKET_STATE_DEAD }; struct packet_cb { u64 nonce; struct noise_keypair *keypair; atomic_t state; u32 mtu; u8 ds; }; #define PACKET_CB(skb) ((struct packet_cb *)((skb)->cb)) #define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer) static inline bool wg_check_packet_protocol(struct sk_buff *skb) { __be16 real_protocol = ip_tunnel_parse_protocol(skb); return real_protocol && skb->protocol == real_protocol; } static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating) { u8 l4_hash = skb->l4_hash; u8 sw_hash = skb->sw_hash; u32 hash = skb->hash; skb_scrub_packet(skb, true); memset(&skb->headers_start, 0, offsetof(struct sk_buff, headers_end) - offsetof(struct sk_buff, headers_start)); if (encapsulating) { skb->l4_hash = l4_hash; skb->sw_hash = sw_hash; skb->hash = hash; } skb->queue_mapping = 0; skb->nohdr = 0; skb->peeked = 0; skb->mac_len = 0; skb->dev = NULL; #ifdef CONFIG_NET_SCHED skb->tc_index = 0; #endif skb_reset_redirect(skb); skb->hdr_len = skb_headroom(skb); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_probe_transport_header(skb); skb_reset_inner_headers(skb); } static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) { unsigned int cpu = *stored_cpu, cpu_index, i; if (unlikely(cpu == nr_cpumask_bits || !cpumask_test_cpu(cpu, cpu_online_mask))) { cpu_index = id % cpumask_weight(cpu_online_mask); cpu = cpumask_first(cpu_online_mask); for (i = 0; i < cpu_index; ++i) cpu = cpumask_next(cpu, cpu_online_mask); *stored_cpu = cpu; } return cpu; } /* This function is racy, in the sense that it's called while last_cpu is * unlocked, so it could return the same CPU twice. Adding locking or using * atomic sequence numbers is slower though, and the consequences of racing are * harmless, so live with it. */ static inline int wg_cpumask_next_online(int *last_cpu) { int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); WRITE_ONCE(*last_cpu, cpu); return cpu; } void wg_prev_queue_init(struct prev_queue *queue); /* Multi producer */ bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); /* Single consumer */ struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); /* Single consumer */ static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) { if (queue->peeked) return queue->peeked; queue->peeked = wg_prev_queue_dequeue(queue); return queue->peeked; } /* Single consumer */ static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) { queue->peeked = NULL; } static inline int wg_queue_enqueue_per_device_and_peer( struct crypt_queue *device_queue, struct prev_queue *peer_queue, struct sk_buff *skb, struct workqueue_struct *wq) { int cpu; atomic_set_release(&PACKET_CB(skb)->state, PACKET_STATE_UNCRYPTED); /* We first queue this up for the peer ingestion, but the consumer * will wait for the state to change to CRYPTED or DEAD before. */ if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) return -ENOSPC; /* Then we queue it up in the device queue, which consumes the * packet as soon as it can. */ cpu = wg_cpumask_next_online(&device_queue->last_cpu); if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb))) return -EPIPE; queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work); return 0; } static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), peer->device->packet_crypt_wq, &peer->transmit_packet_work); wg_peer_put(peer); } static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); napi_schedule(&peer->napi); wg_peer_put(peer); } #ifdef DEBUG bool wg_packet_counter_selftest(void); #endif #endif /* _WG_QUEUEING_H */
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_H #define BLK_MQ_H #include <linux/blkdev.h> #include <linux/sbitmap.h> #include <linux/srcu.h> #include <linux/lockdep.h> struct blk_mq_tags; struct blk_flush_queue; /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device */ struct blk_mq_hw_ctx { struct { /** @lock: Protects the dispatch list. */ spinlock_t lock; /** * @dispatch: Used for requests that are ready to be * dispatched to the hardware but for some reason (e.g. lack of * resources) could not be sent to the hardware. As soon as the * driver can send new requests, requests at this list will * be sent first for a fairer dispatch. */ struct list_head dispatch; /** * @state: BLK_MQ_S_* flags. Defines the state of the hw * queue (active, scheduled to restart, stopped). */ unsigned long state; } ____cacheline_aligned_in_smp; /** * @run_work: Used for scheduling a hardware queue run at a later time. */ struct delayed_work run_work; /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; /** * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU * selection from @cpumask. */ int next_cpu; /** * @next_cpu_batch: Counter of how many works left in the batch before * changing to the next CPU. */ int next_cpu_batch; /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ unsigned long flags; /** * @sched_data: Pointer owned by the IO scheduler attached to a request * queue. It's up to the IO scheduler how to use this pointer. */ void *sched_data; /** * @queue: Pointer to the request queue that owns this hardware context. */ struct request_queue *queue; /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; /** * @driver_data: Pointer to data owned by the block driver that created * this hctx */ void *driver_data; /** * @ctx_map: Bitmap for each software queue. If bit is on, there is a * pending request in that software queue. */ struct sbitmap ctx_map; /** * @dispatch_from: Software queue to be used when no scheduler was * selected. */ struct blk_mq_ctx *dispatch_from; /** * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to * decide if the hw_queue is busy using Exponential Weighted Moving * Average algorithm. */ unsigned int dispatch_busy; /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; /** * @dispatch_wait: Waitqueue to put requests when there is no tag * available at the moment, to wait for another try in the future. */ wait_queue_entry_t dispatch_wait; /** * @wait_index: Index of next available dispatch_wait queue to insert * requests. */ atomic_t wait_index; /** * @tags: Tags owned by the block driver. A tag at this set is only * assigned when a request is dispatched from a hardware queue. */ struct blk_mq_tags *tags; /** * @sched_tags: Tags owned by I/O scheduler. If there is an I/O * scheduler associated with a request queue, a tag is assigned when * that request is allocated. Else, this member is not used. */ struct blk_mq_tags *sched_tags; /** @queued: Number of queued requests. */ unsigned long queued; /** @run: Number of dispatched requests. */ unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 /** @dispatched: Number of dispatch requests by queue. */ unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; /** * @nr_active: Number of active requests. Only used when a tag set is * shared across request queues. */ atomic_t nr_active; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ struct kobject kobj; /** @poll_considered: Count times blk_poll() was called. */ unsigned long poll_considered; /** @poll_invoked: Count how many requests blk_poll() polled. */ unsigned long poll_invoked; /** @poll_success: Count how many polled requests were completed. */ unsigned long poll_success; #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named * as cpu<cpu_number>. */ struct dentry *debugfs_dir; /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif /** * @hctx_list: if this hctx is not in use, this is an entry in * q->unused_hctx_list. */ struct list_head hctx_list; /** * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also * blk_mq_hw_ctx_size(). */ struct srcu_struct srcu[]; }; /** * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). * @nr_queues: Number of hardware queues to map CPU IDs onto. * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe * driver to map each hardware queue type (enum hctx_type) onto a distinct * set of hardware queues. */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; unsigned int queue_offset; }; /** * enum hctx_type - Type of hardware queue * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. * @HCTX_TYPE_READ: Just for READ I/O. * @HCTX_TYPE_POLL: Polled I/O of any kind. * @HCTX_MAX_TYPES: Number of types of hctx. */ enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; /** * struct blk_mq_tag_set - tag set that can be shared between request queues * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the * same size, and it's perfectly legal to share maps between * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. * @ops: Pointers to functions that implement block driver behavior. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag * allocations. * @cmd_size: Number of additional bytes to allocate per request. The block * driver owns these additional bytes. * @numa_node: NUMA node the storage adapter has been connected to. * @timeout: Request processing timeout in jiffies. * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. * @active_queues_shared_sbitmap: * number of active request queues per tag set. * @__bitmap_tags: A shared tags sbitmap, used over all hctx's * @__breserved_tags: * A shared reserved tags sbitmap, used over all hctx's * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. */ struct blk_mq_tag_set { struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsigned int nr_maps; const struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; unsigned int reserved_tags; unsigned int cmd_size; int numa_node; unsigned int timeout; unsigned int flags; void *driver_data; atomic_t active_queues_shared_sbitmap; struct sbitmap_queue __bitmap_tags; struct sbitmap_queue __breserved_tags; struct blk_mq_tags **tags; struct mutex tag_list_lock; struct list_head tag_list; }; /** * struct blk_mq_queue_data - Data about a request inserted in a queue * * @rq: Request pointer. * @last: If it is the last request in the queue. */ struct blk_mq_queue_data { struct request *rq; bool last; }; typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); /** * struct blk_mq_ops - Callback functions that implements block driver * behaviour. */ struct blk_mq_ops { /** * @queue_rq: Queue a new request from block IO. */ blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); /** * @commit_rqs: If a driver uses bd->last to judge when to submit * requests to hardware, it must define this function. In case of errors * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ void (*commit_rqs)(struct blk_mq_hw_ctx *); /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ int (*get_budget)(struct request_queue *); /** * @put_budget: Release the reserved budget. */ void (*put_budget)(struct request_queue *, int); /** * @set_rq_budget_token: store rq's budget token */ void (*set_rq_budget_token)(struct request *, int); /** * @get_rq_budget_token: retrieve rq's budget token */ int (*get_rq_budget_token)(struct request *); /** * @timeout: Called on request timeout. */ enum blk_eh_timer_return (*timeout)(struct request *, bool); /** * @poll: Called to poll for completion of a specific tag. */ int (*poll)(struct blk_mq_hw_ctx *); /** * @complete: Mark the request as complete. */ void (*complete)(struct request *); /** * @init_hctx: Called when the block layer side of a hardware queue has * been set up, allowing the driver to allocate/init matching * structures. */ int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int); /** * @exit_hctx: Ditto for exit/teardown. */ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); /** * @init_request: Called for every command allocated by the block layer * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. */ int (*init_request)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); /** * @exit_request: Ditto for exit/teardown. */ void (*exit_request)(struct blk_mq_tag_set *set, struct request *, unsigned int); /** * @initialize_rq_fn: Called from inside blk_get_request(). */ void (*initialize_rq_fn)(struct request *rq); /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. */ void (*cleanup_rq)(struct request *); /** * @busy: If set, returns whether or not this queue currently is busy. */ bool (*busy)(struct request_queue *); /** * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ int (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); #endif }; enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, /* Do not allow an I/O scheduler to be configured. */ BLK_MQ_F_NO_SCHED = 1 << 6, /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. */ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE = 3, BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_CPU_WORK_BATCH = 8, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) #define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, queuedata) \ ({ \ static struct lock_class_key __key; \ \ __blk_mq_alloc_disk(set, queuedata, &__key); \ }) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_free_request(struct request *rq); bool blk_mq_queue_inflight(struct request_queue *q); enum { /* return when out of requests */ BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { BLK_MQ_UNIQUE_TAG_BITS = 16, BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, }; u32 blk_mq_unique_tag(struct request *rq); static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) { return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; } static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) { return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } /** * blk_mq_rq_state() - read the current MQ_RQ_* state of a request * @rq: target request. */ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) { return READ_ONCE(rq->state); } static inline int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } static inline int blk_mq_request_completed(struct request *rq) { return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; } /* * * Set the state to complete when completing a request from inside ->queue_rq. * This is used by drivers that want to ensure special complete actions that * need access to the request are called on failure, e.g. by nvme for * multipathing. */ static inline void blk_mq_set_request_complete(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); bool __blk_should_fake_timeout(struct request_queue *q); static inline bool blk_should_fake_timeout(struct request_queue *q) { if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) return __blk_should_fake_timeout(q); return false; } /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted * * Return: request * * Driver command data is immediately after the request. So subtract request * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } /** * blk_mq_rq_to_pdu - cast a request to a PDU * @rq: the request to be casted * * Return: pointer to the PDU * * Driver command data is immediately after the request. So add request to get * the PDU. */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; } #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { if (rq->tag != -1) return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) | BLK_QC_T_INTERNAL; } static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) rq->q->mq_ops->cleanup_rq(rq); } static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs) { rq->nr_phys_segments = nr_segs; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); if (bio->bi_bdev) rq->rq_disk = bio->bi_bdev->bd_disk; } blk_qc_t blk_mq_submit_bio(struct bio *bio); void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); #endif
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 /* SPDX-License-Identifier: GPL-2.0 */ /* * Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/types.h> struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { union { u64 data_ack; u32 data_ack32; }; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, frozen:1, reset_transient:1; u8 reset_reason:4, csum_reqd:1; }; #define MPTCP_RM_IDS_MAX 8 struct mptcp_rm_list { u8 ids[MPTCP_RM_IDS_MAX]; u8 nr; }; struct mptcp_addr_info { u8 id; sa_family_t family; __be16 port; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct in6_addr addr6; #endif }; }; struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; struct mptcp_rm_list rm_list; u8 join_id; u8 backup; u8 reset_reason:4, reset_transient:1, csum_reqd:1, allow_join_id0:1; union { struct { u64 sndr_key; u64 rcvr_key; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; }; struct { struct mptcp_addr_info addr; u64 ahmac; }; struct { struct mptcp_ext ext_copy; u64 fail_seq; }; struct { u32 nonce; u32 token; u64 thmac; u8 hmac[20]; }; }; #endif }; #ifdef CONFIG_MPTCP void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) { return tcp_sk(sk)->is_mptcp; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp; } static inline bool rsk_drop_req(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; } void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts); /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ static inline void mptcp_skb_ext_move(struct sk_buff *to, struct sk_buff *from) { if (!skb_ext_exist(from, SKB_EXT_MPTCP)) return; if (WARN_ON_ONCE(to->active_extensions)) skb_ext_put(to); to->active_extensions = from->active_extensions; to->extensions = from->extensions; from->active_extensions = 0; } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { struct mptcp_ext *from_ext; from_ext = skb_ext_find(from, SKB_EXT_MPTCP); if (!from_ext) return; from_ext->frozen = 1; skb_ext_copy(to, from); } static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { /* MPTCP always clears the ext when adding it to the skb, so * holes do not bother us here */ return !from_ext || (to_ext && from_ext && !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); } /* check if skbs can be collapsed. * MPTCP collapse is allowed if neither @to or @from carry an mptcp data * mapping, or if the extension of @to is the same as @from. * Collapsing is not possible if @to lacks an extension, but @from carries one. */ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), skb_ext_find(from, SKB_EXT_MPTCP)); } void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { if (skb_ext_exist(skb, SKB_EXT_MPTCP)) return mptcp_get_reset_option(skb); return htonl(0u); } #else static inline void mptcp_init(void) { } static inline bool sk_is_mptcp(const struct sock *sk) { return false; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return false; } static inline bool rsk_drop_req(const struct request_sock *req) { return false; } static inline void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { return true; } static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { } static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return true; } static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { return 0; /* TCP fallback */ } static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { return NULL; } static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); void mptcpv6_handle_mapped(struct sock *sk, bool mapped); #elif IS_ENABLED(CONFIG_IPV6) static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif #endif /* __NET_MPTCP_H */
3174 1081 2712 698 75 366 479 7 23 4 159 2103 856 14 14 42 88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_H #define _LINUX_RCULIST_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list.h> #include <linux/rcupdate.h> /* * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers * @list: list to be initialized * * You should instead use INIT_LIST_HEAD() for normal initialization and * cleanup tasks, when readers have no access to the list being initialized. * However, if the list being initialized is visible to readers, you * need to keep the compiler from being too mischievous. */ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list * * Note: This should only be used with the list header, and even then * only if list_del() and similar primitives are not also used on the * list header. */ #define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) /* * Check during list traversal that we are within an RCU reader */ #define check_arg_count_one(dummy) #ifdef CONFIG_PROVE_RCU_LIST #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) #define __list_check_srcu(cond) \ ({ \ RCU_LOCKDEP_WARN(!(cond), \ "RCU-list traversed without holding the required lock!");\ }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) #define __list_check_srcu(cond) ({ }) #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } /** * list_add_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head, head->next); } /** * list_add_tail_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_tail_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_tail_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head->prev, head); } /** * list_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * Note: list_empty() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_del_rcu() * or list_add_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_del_rcu(struct list_head *entry) { __list_del_entry(entry); entry->prev = LIST_POISON2; } /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_add_head_rcu() or * hlist_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_for_each_entry_rcu(). */ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically. * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) { new->next = old->next; new->prev = old->prev; rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice * @prev: points to the last element of the existing list * @next: points to the first element of the existing list * @sync: synchronize_rcu, synchronize_rcu_expedited, ... * * The list pointed to by @prev and @next can be RCU-read traversed * concurrently with this function. * * Note that this function blocks. * * Important note: the caller must take whatever action is necessary to prevent * any other updates to the existing list. In principle, it is possible to * modify the list as soon as sync() begins execution. If this sort of thing * becomes necessary, an alternative version based on call_rcu() could be * created. But only if -really- needed -- there is no shortage of RCU API * members. */ static inline void __list_splice_init_rcu(struct list_head *list, struct list_head *prev, struct list_head *next, void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; /* * "first" and "last" tracking list, so initialize it. RCU readers * have access to this list, so we must use INIT_LIST_HEAD_RCU() * instead of INIT_LIST_HEAD(). */ INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list. * Wait for any readers to finish using the list before splicing * the list body into the new list. Any new readers will see * an empty list. */ sync(); ASSERT_EXCLUSIVE_ACCESS(*first); ASSERT_EXCLUSIVE_ACCESS(*last); /* * Readers are finished with the source list, so perform splice. * The order is important if the new list is global and accessible * to concurrent RCU readers. Note that RCU readers are not * permitted to traverse the prev pointers without excluding * this function. */ last->next = next; rcu_assign_pointer(list_next_rcu(prev), first); first->prev = prev; next->prev = last; } /** * list_splice_init_rcu - splice an RCU-protected list into an existing list, * designed for stacks. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head, head->next, sync); } /** * list_splice_tail_init_rcu - splice an RCU-protected list into an existing * list, designed for queues. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_tail_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head->prev, head, sync); } /** * list_entry_rcu - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? * * They do not exist because they would lead to subtle race conditions: * * if (!list_empty_rcu(mylist)) { * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); * do_something(bar); * } * * The list might be non-empty when list_empty_rcu() checks it, but it * might have become empty by the time that list_first_entry_rcu() rereads * the ->next pointer, which would result in a SEGV. * * When not using RCU, it is OK for list_first_entry() to re-read that * pointer because both functions should be protected by some lock that * blocks writers. * * When using RCU, list_empty() uses READ_ONCE() to fetch the * RCU-protected ->next pointer and then compares it to the address of the * list head. However, it neither dereferences this pointer nor provides * this pointer to its caller. Thus, READ_ONCE() suffices (that is, * rcu_dereference() is not needed), which means that list_empty() can be * used anywhere you would want to use list_empty_rcu(). Just don't * expect anything useful to happen if you do a subsequent lockless * call to list_first_entry_rcu()!!! * * See list_first_or_null_rcu for an alternative. */ /** * list_first_or_null_rcu - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_first_or_null_rcu(ptr, type, member) \ ({ \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ }) /** * list_next_or_null_rcu - get the first element from a list * @head: the head for the list. * @ptr: the list head to take the next element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the ptr is at the end of the list, NULL is returned. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_next_or_null_rcu(head, ptr, type, member) \ ({ \ struct list_head *__head = (head); \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__next != __head) ? list_entry_rcu(__next, type, \ member) : NULL; \ }) /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define list_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_entry_lockless - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_entry_lockless(ptr, type, member) \ container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_for_each_entry_lockless(pos, head, member) \ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position which must have been in the list when the RCU read * lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_from_rcu() except * this starts after the given position and that one starts at the given * position. */ #define list_for_each_entry_continue_rcu(pos, head, member) \ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_from_rcu - iterate over a list from current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_node within the struct. * * Iterate over the tail of a list starting from a given position, * which must have been in the list when the RCU read lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_continue_rcu() except * this starts from the given position and that one starts from the position * after the given position. */ #define list_for_each_entry_from_rcu(pos, head, member) \ for (; &(pos)->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member)) /** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry(). */ static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) WRITE_ONCE(new->next->pprev, &new->next); WRITE_ONCE(old->pprev, LIST_POISON2); } /** * hlists_swap_heads_rcu - swap the lists the hlist heads point to * @left: The hlist head on the left * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) { struct hlist_node *node1 = left->first; struct hlist_node *node2 = right->first; rcu_assign_pointer(left->first, node2); rcu_assign_pointer(right->first, node1); WRITE_ONCE(node2->pprev, &left->first); WRITE_ONCE(node1->pprev, &right->first); } /* * return the first or the next element in an RCU protected hlist */ #define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) #define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) #define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_tail_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; i; i = i->next) last = i; if (last) { n->next = last->next; WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); } } /** * hlist_add_before_rcu * @n: the new element to add to the hash list. * @next: the existing element to add the new element before. * * Description: * Adds the specified element to the specified hlist * before the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); WRITE_ONCE(next->pprev, &n->next); } /** * hlist_add_behind_rcu * @n: the new element to add to the hash list. * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist * after the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } #define __hlist_for_each_rcu(pos, head) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos; \ pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define hlist_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). * * This is the same as hlist_for_each_entry_rcu() except that it does * not do any RCU debugging or tracing. */ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu_bh(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu_bh(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from_rcu(pos, member) \ for (; pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) #endif /* __KERNEL__ */ #endif
146 146 146 146 146 5 7 7 7 7 7 7 11 10 11 3 1 7 11 1 1 144 144 144 144 144 129 1 1 124 124 59 85 141 129 140 110 118 13 116 13 117 118 130 1 124 124 114 117 131 128 3 3 24 20 4 3 4 3 3 54 56 56 63 3 62 60 62 5 56 60 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/base.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc base directory handling functions * * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. * Instead of using magical inumbers to determine the kind of object * we allocate and fill in-core inodes upon lookup. They don't even * go into icache. We cache the reference to task_struct upon lookup too. * Eventually it should become a filesystem in its own. We don't use the * rest of procfs anymore. * * * Changelog: * 17-Jan-2005 * Allan Bezerra * Bruna Moreira <bruna.moreira@indt.org.br> * Edjard Mota <edjard.mota@indt.org.br> * Ilias Biris <ilias.biris@indt.org.br> * Mauricio Lin <mauricio.lin@indt.org.br> * * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * * A new process specific entry (smaps) included in /proc. It shows the * size of rss for each memory area. The maps entry lacks information * about physical memory size (rss) for each mapped file, i.e., * rss information for executables and library files. * This additional information is useful for any tools that need to know * about physical memory consumption for a process specific library. * * Changelog: * 21-Feb-2005 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * Pud inclusion in the page table walking. * * ChangeLog: * 10-Mar-2005 * 10LE Instituto Nokia de Tecnologia - INdT: * A better way to walks through the page table as suggested by Hugh Dickins. * * Simo Piiroinen <simo.piiroinen@nokia.com>: * Smaps information related to shared, private, clean and dirty pages. * * Paul Mundt <paul.mundt@nokia.com>: * Overall revision about smaps. */ #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/task_io_accounting_ops.h> #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> #include <linux/mnt_namespace.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/printk.h> #include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> #include <linux/nsproxy.h> #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" #include "../../lib/kstrtox.h" /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during * each system call not at open time. The reason is that most of * what we wish to check for permissions in /proc varies at runtime. * * The classic example of a problem is opening file descriptors * in /proc for a task before it execs a suid executable. */ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; enum proc_mem_force { PROC_MEM_FORCE_ALWAYS, PROC_MEM_FORCE_PTRACE, PROC_MEM_FORCE_NEVER }; static enum proc_mem_force proc_mem_force_override __ro_after_init = IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE : PROC_MEM_FORCE_ALWAYS; static const struct constant_table proc_mem_force_table[] __initconst = { { "always", PROC_MEM_FORCE_ALWAYS }, { "ptrace", PROC_MEM_FORCE_PTRACE }, { "never", PROC_MEM_FORCE_NEVER }, { } }; static int __init early_proc_mem_force_override(char *buf) { if (!buf) return -EINVAL; /* * lookup_constant() defaults to proc_mem_force_override to preseve * the initial Kconfig choice in case an invalid param gets passed. */ proc_mem_force_override = lookup_constant(proc_mem_force_table, buf, proc_mem_force_override); return 0; } early_param("proc_mem.force_override", early_proc_mem_force_override); struct pid_entry { const char *name; unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; }; #define NOD(NAME, MODE, IOP, FOP, OP) { \ .name = (NAME), \ .len = sizeof(NAME) - 1, \ .mode = MODE, \ .iop = IOP, \ .fop = FOP, \ .op = OP, \ } #define DIR(NAME, MODE, iops, fops) \ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) #define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) #define ATTR(LSM, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ { .lsm = LSM }) /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; } return count; } static int get_task_root(struct task_struct *task, struct path *root) { int result = -ENOENT; task_lock(task); if (task->fs) { get_fs_root(task->fs, root); result = 0; } task_unlock(task); return result; } static int proc_cwd_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { task_lock(task); if (task->fs) { get_fs_pwd(task->fs, path); result = 0; } task_unlock(task); put_task_struct(task); } return result; } static int proc_root_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { result = get_task_root(task, path); put_task_struct(task); } return result; } /* * If the user used setproctitle(), we just get the string from * user space at arg_start, and limit it to a maximum of one page. */ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, size_t count, unsigned long pos, unsigned long arg_start) { char *page; int ret, got; if (pos >= PAGE_SIZE) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); if (got > 0) { int len = strnlen(page, got); /* Include the NUL character if it was found */ if (len < got) len++; if (len > pos) { len -= pos; if (len > count) len = count; len -= copy_to_user(buf, page+pos, len); if (!len) len = -EFAULT; ret = len; } } free_page((unsigned long)page); return ret; } static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, size_t count, loff_t *ppos) { unsigned long arg_start, arg_end, env_start, env_end; unsigned long pos, len; char *page, c; /* Check if process spawned far enough to have cmdline. */ if (!mm->env_end) return 0; spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); if (arg_start >= arg_end) return 0; /* * We allow setproctitle() to overwrite the argument * strings, and overflow past the original end. But * only when it overflows into the environment area. */ if (env_start != arg_end || env_end < env_start) env_start = env_end = arg_end; len = env_end - arg_start; /* We're not going to care if "*ppos" has high bits set */ pos = *ppos; if (pos >= len) return 0; if (count > len - pos) count = len - pos; if (!count) return 0; /* * Magical special case: if the argv[] end byte is not * zero, the user has overwritten it with setproctitle(3). * * Possible future enhancement: do this only once when * pos is 0, and set a flag in the 'struct file'. */ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) return get_mm_proctitle(mm, buf, count, pos, arg_start); /* * For the non-setproctitle() case we limit things strictly * to the [arg_start, arg_end[ range. */ pos += arg_start; if (pos < arg_start || pos >= arg_end) return 0; if (count > arg_end - pos) count = arg_end - pos; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; len = 0; while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); got = access_remote_vm(mm, pos, page, size, FOLL_ANON); if (got <= 0) break; got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len) len = -EFAULT; break; } pos += got; buf += got; len += got; count -= got; } free_page((unsigned long)page); return len; } static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, size_t count, loff_t *pos) { struct mm_struct *mm; ssize_t ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); return ret; } static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct task_struct *tsk; ssize_t ret; BUG_ON(*pos < 0); tsk = get_proc_task(file_inode(file)); if (!tsk) return -ESRCH; ret = get_task_cmdline(tsk, buf, count, pos); put_task_struct(tsk); if (ret > 0) *pos += ret; return ret; } static const struct file_operations proc_pid_cmdline_ops = { .read = proc_pid_cmdline_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol. If that fails, simply return the address. */ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); return 0; } print0: seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ static int lock_trace(struct task_struct *task) { int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; } static void unlock_trace(struct task_struct *task) { up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long *entries; int err; /* * The ability to racily run the kernel stack unwinder on a running task * and then observe the unwinder output is scary; while it is useful for * debugging kernel issues, it can also allow an attacker to leak kernel * stack contents. * Doing this in a manner that is at least safe from races would require * some work to ensure that the remote task can not be scheduled; and * even then, this would still expose the unwinder as local attack * surface. * Therefore, this interface is restricted to root. */ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; err = lock_trace(task); if (!err) { unsigned int i, nr_entries; nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, 0); for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } kfree(entries); return err; } #endif #ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); return 0; } #endif #ifdef CONFIG_LATENCYTOP static int lstats_show_proc(struct seq_file *m, void *v) { int i; struct inode *inode = m->private; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; seq_printf(m, "%i %li %li", lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; if (!bt) break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); } } put_task_struct(task); return 0; } static int lstats_open(struct inode *inode, struct file *file) { return single_open(file, lstats_show_proc, inode); } static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; clear_tsk_latency_tracing(task); put_task_struct(task); return count; } static const struct file_operations proc_lstats_operations = { .open = lstats_open, .read = seq_read, .write = lstats_write, .llseek = seq_lseek, .release = single_release, }; #endif static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; long badness; badness = oom_badness(task, totalpages); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been * exporting for a long time so userspace might depend on it. */ if (badness != LONG_MIN) points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; seq_printf(m, "%lu\n", points); return 0; } struct limit_names { const char *name; const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, [RLIMIT_CORE] = {"Max core file size", "bytes"}, [RLIMIT_RSS] = {"Max resident set", "bytes"}, [RLIMIT_NPROC] = {"Max processes", "processes"}, [RLIMIT_NOFILE] = {"Max open files", "files"}, [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, [RLIMIT_AS] = {"Max address space", "bytes"}, [RLIMIT_LOCKS] = {"Max file locks", "locks"}, [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, [RLIMIT_NICE] = {"Max nice priority", NULL}, [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, }; /* Display limits for a process */ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned int i; unsigned long flags; struct rlimit rlim[RLIM_NLIMITS]; if (!lock_task_sighand(task, &flags)) return 0; memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); /* * print the file header */ seq_puts(m, "Limit " "Soft Limit " "Hard Limit " "Units \n"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); else seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) seq_printf(m, "%-10s\n", lnames[i].unit); else seq_putc(m, '\n'); } return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct syscall_info info; u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); else if (info.data.nr < 0) seq_printf(m, "%d 0x%llx 0x%llx\n", info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ /* permission checks */ static int proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; int allowed = 0; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. */ task = get_proc_task(inode); if (task) { allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; } int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } /* * May current process learn task's sched/cmdline info (for hide_pid_min=1) * or euid/egid (for hide_pid_min=2)? */ static bool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, enum proc_hidepid hide_pid_min) { /* * If 'hidpid' mount option is set force a ptrace check, * we indicate that we are using a filesystem syscall * by passing PTRACE_MODE_READ_FSCREDS */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); if (fs_info->hide_pid < hide_pid_min) return true; if (in_group_p(fs_info->pid_gid)) return true; return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } static int proc_pid_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; bool has_perms; task = get_proc_task(inode); if (!task) return -ESRCH; has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS); put_task_struct(task); if (!has_perms) { if (fs_info->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process * may not stat() a file, it shouldn't be seen * in procfs at all. */ return -ENOENT; } return -EPERM; } return generic_permission(&init_user_ns, inode, mask); } static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); put_task_struct(task); return ret; } static int proc_single_open(struct inode *inode, struct file *filp) { return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { .open = proc_single_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { /* ensure this mm_struct can't be freed */ mmgrab(mm); /* but do not pin its memory */ mmput(mm); } } return mm; } static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR(mm)) return PTR_ERR(mm); file->private_data = mm; return 0; } static int mem_open(struct inode *inode, struct file *file) { int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; return ret; } static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) { struct task_struct *task; bool ptrace_active = false; switch (proc_mem_force_override) { case PROC_MEM_FORCE_NEVER: return false; case PROC_MEM_FORCE_PTRACE: task = get_proc_task(file_inode(file)); if (task) { ptrace_active = READ_ONCE(task->ptrace) && READ_ONCE(task->mm) == mm && READ_ONCE(task->parent) == current; put_task_struct(task); } return ptrace_active; default: return true; } } static ssize_t mem_rw(struct file *file, char __user *buf, size_t count, loff_t *ppos, int write) { struct mm_struct *mm = file->private_data; unsigned long addr = *ppos; ssize_t copied; char *page; unsigned int flags; if (!mm) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; copied = 0; if (!mmget_not_zero(mm)) goto free; flags = write ? FOLL_WRITE : 0; if (proc_mem_foll_force(file, mm)) flags |= FOLL_FORCE; while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; break; } if (!write && copy_to_user(buf, page, this_len)) { copied = -EFAULT; break; } buf += this_len; addr += this_len; copied += this_len; count -= this_len; } *ppos = addr; mmput(mm); free: free_page((unsigned long) page); return copied; } static ssize_t mem_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, buf, count, ppos, 0); } static ssize_t mem_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, (char __user*)buf, count, ppos, 1); } loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: file->f_pos = offset; break; case 1: file->f_pos += offset; break; default: return -EINVAL; } force_successful_syscall_return(); return file->f_pos; } static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, .release = mem_release, }; static int environ_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ); } static ssize_t environ_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { char *page; unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; unsigned long env_start, env_end; /* Ensure the process spawned far enough to have an environment. */ if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; if (!mmget_not_zero(mm)) goto free; spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; int retval; if (src >= (env_end - env_start)) break; this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; break; } if (copy_to_user(buf, page, retval)) { ret = -EFAULT; break; } ret += retval; src += retval; buf += retval; count -= retval; } *ppos = src; mmput(mm); free: free_page((unsigned long) page); return ret; } static const struct file_operations proc_environ_operations = { .open = environ_open, .read = environ_read, .llseek = generic_file_llseek, .release = mem_release, }; static int auxv_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); } static ssize_t auxv_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; if (!mm) return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); } static const struct file_operations proc_auxv_operations = { .open = auxv_open, .read = auxv_read, .llseek = generic_file_llseek, .release = mem_release, }; static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; if (!task) return -ESRCH; if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) oom_adj = OOM_ADJUST_MAX; else oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); if (oom_adj > OOM_ADJUST_MAX) oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mutex_lock(&oom_adj_mutex); if (legacy) { if (oom_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } /* * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead. */ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); } else { if ((short)oom_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } } /* * Make sure we will check other processes sharing the mm if this is * not vfrok which wants its own oom_score_adj. * pin the mm so it doesn't go away and get reused after task_unlock */ if (!task->vfork_done) { struct task_struct *p = find_lock_task_mm(task); if (p) { if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } task_unlock(p); } } task->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = (short)oom_adj; trace_oom_score_adj_update(task); if (mm) { struct task_struct *p; rcu_read_lock(); for_each_process(p) { if (same_thread_group(task, p)) continue; /* do not touch kernel threads or the global init */ if (p->flags & PF_KTHREAD || is_global_init(p)) continue; task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; } task_unlock(p); } rcu_read_unlock(); mmdrop(mm); } err_unlock: mutex_unlock(&oom_adj_mutex); put_task_struct(task); return err; } /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. * Values written to oom_adj are simply mapped linearly to oom_score_adj. * Processes that become oom disabled via oom_adj will still be oom disabled * with this implementation. * * oom_adj cannot be removed since existing userspace binaries use it. */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF]; int oom_adj; int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_adj); if (err) goto out; if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && oom_adj != OOM_DISABLE) { err = -EINVAL; goto out; } /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. */ if (oom_adj == OOM_ADJUST_MAX) oom_adj = OOM_SCORE_ADJ_MAX; else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_adj_operations = { .read = oom_adj_read, .write = oom_adj_write, .llseek = generic_file_llseek, }; static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; size_t len; if (!task) return -ESRCH; oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF]; int oom_score_adj; int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || oom_score_adj > OOM_SCORE_ADJ_MAX) { err = -EINVAL; goto out; } err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_score_adj_operations = { .read = oom_score_adj_read, .write = oom_score_adj_write, .llseek = default_llseek, }; #ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", from_kuid(file->f_cred->user_ns, audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); uid_t loginuid; kuid_t kloginuid; int rv; /* Don't let kthreads write their own loginuid */ if (current->flags & PF_KTHREAD) return -EPERM; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } rv = kstrtou32_from_user(buf, count, 10, &loginuid); if (rv < 0) return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); if (!uid_valid(kloginuid)) return -EINVAL; } rv = audit_set_loginuid(kloginuid); if (rv < 0) return rv; return count; } static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, .llseek = generic_file_llseek, }; static ssize_t proc_sessionid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_sessionid(task)); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations proc_sessionid_operations = { .read = proc_sessionid_read, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_FAULT_INJECTION static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; size_t len; int make_it_fail; if (!task) return -ESRCH; make_it_fail = task->make_it_fail; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF]; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); if (rv < 0) return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; put_task_struct(task); return count; } static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; int err; unsigned int n; err = kstrtouint_from_user(buf, count, 0, &n); if (err) return err; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->fail_nth = n; put_task_struct(task); return count; } static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char numbuf[PROC_NUMBUF]; ssize_t len; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, numbuf, len); } static const struct file_operations proc_fail_nth_operations = { .read = proc_fail_nth_read, .write = proc_fail_nth_write, }; #endif #ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: */ static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_show_task(p, ns, m); put_task_struct(p); return 0; } static ssize_t sched_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_set_task(p); put_task_struct(p); return count; } static int sched_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_show, inode); } static const struct file_operations proc_pid_sched_operations = { .open = sched_open, .read = seq_read, .write = sched_write, .llseek = seq_lseek, .release = single_release, }; #endif #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: */ static int sched_autogroup_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_autogroup_show_task(p, m); put_task_struct(p); return 0; } static ssize_t sched_autogroup_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[PROC_NUMBUF]; int nice; int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; err = kstrtoint(strstrip(buffer), 0, &nice); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; put_task_struct(p); return count; } static int sched_autogroup_open(struct inode *inode, struct file *filp) { int ret; ret = single_open(filp, sched_autogroup_show, NULL); if (!ret) { struct seq_file *m = filp->private_data; m->private = inode; } return ret; } static const struct file_operations proc_pid_sched_autogroup_operations = { .open = sched_autogroup_open, .read = seq_read, .write = sched_autogroup_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_SCHED_AUTOGROUP */ #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { struct task_struct *p; p = get_proc_task(file_inode(m->file)); if (!p) return -ESRCH; proc_timens_show_offsets(p, m); put_task_struct(p); return 0; } static ssize_t timens_offsets_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct proc_timens_offset offsets[2]; char *kbuf = NULL, *pos, *next_line; struct task_struct *p; int ret, noffsets; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* Parse the user data */ ret = -EINVAL; noffsets = 0; for (pos = kbuf; pos; pos = next_line) { struct proc_timens_offset *off = &offsets[noffsets]; char clock[10]; int err; /* Find the end of line and ensure we don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } err = sscanf(pos, "%9s %lld %lu", clock, &off->val.tv_sec, &off->val.tv_nsec); if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) goto out; clock[sizeof(clock) - 1] = 0; if (strcmp(clock, "monotonic") == 0 || strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0) off->clockid = CLOCK_MONOTONIC; else if (strcmp(clock, "boottime") == 0 || strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0) off->clockid = CLOCK_BOOTTIME; else goto out; noffsets++; if (noffsets == ARRAY_SIZE(offsets)) { if (next_line) count = next_line - kbuf; break; } } ret = -ESRCH; p = get_proc_task(inode); if (!p) goto out; ret = proc_timens_set_offset(file, p, offsets, noffsets); put_task_struct(p); if (ret) goto out; ret = count; out: kfree(kbuf); return ret; } static int timens_offsets_open(struct inode *inode, struct file *filp) { return single_open(filp, timens_offsets_show, inode); } static const struct file_operations proc_timens_offsets_operations = { .open = timens_offsets_open, .read = seq_read, .write = timens_offsets_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_TIME_NS */ static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN]; const size_t maxlen = sizeof(buffer) - 1; memset(buffer, 0, sizeof(buffer)); if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); if (!p) return -ESRCH; if (same_thread_group(current, p)) { set_task_comm(p, buffer); proc_comm_connector(p); } else count = -EINVAL; put_task_struct(p); return count; } static int comm_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_task_name(m, p, false); seq_putc(m, '\n'); put_task_struct(p); return 0; } static int comm_open(struct inode *inode, struct file *filp) { return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { .open = comm_open, .read = seq_read, .write = comm_write, .llseek = seq_lseek, .release = single_release, }; static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; exe_file = get_task_exe_file(task); put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); fput(exe_file); return 0; } else return -ENOENT; } static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = nd_jump_link(&path); out: return ERR_PTR(error); } static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { char *tmp = (char *)__get_free_page(GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; pathname = d_path(path, tmp, PAGE_SIZE); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; len = tmp + PAGE_SIZE - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: free_page((unsigned long)tmp); return len; } static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; struct inode *inode = d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = do_proc_readlink(&path, buffer, buflen); path_put(&path); out: return error; } const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_pid_get_link, .setattr = proc_setattr, }; /* building an inode */ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a * proc file for a task. */ const struct cred *cred; kuid_t uid; kgid_t gid; if (unlikely(task->flags & PF_KTHREAD)) { *ruid = GLOBAL_ROOT_UID; *rgid = GLOBAL_ROOT_GID; return; } /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); uid = cred->euid; gid = cred->egid; rcu_read_unlock(); /* * Before the /proc/pid/status file was created the only way to read * the effective uid of a /process was to stat /proc/pid. Reading * /proc/pid/status is slow enough that procps and other packages * kept stating /proc/pid. To keep the rules in /proc simple I have * made this apply to all per process world readable and executable * directories. */ if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { struct mm_struct *mm; task_lock(task); mm = task->mm; /* Make non-dumpable tasks owned by some root */ if (mm) { if (get_dumpable(mm) != SUID_DUMP_USER) { struct user_namespace *user_ns = mm->user_ns; uid = make_kuid(user_ns, 0); if (!uid_valid(uid)) uid = GLOBAL_ROOT_UID; gid = make_kgid(user_ns, 0); if (!gid_valid(gid)) gid = GLOBAL_ROOT_GID; } } else { uid = GLOBAL_ROOT_UID; gid = GLOBAL_ROOT_GID; } task_unlock(task); } *ruid = uid; *rgid = gid; } void proc_pid_evict_inode(struct proc_inode *ei) { struct pid *pid = ei->pid; if (S_ISDIR(ei->vfs_inode.i_mode)) { spin_lock(&pid->lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } put_pid(pid); } struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; struct pid *pid; /* We need a new inode */ inode = new_inode(sb); if (!inode) goto out; /* Common stuff */ ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. */ pid = get_task_pid(task, PIDTYPE_PID); if (!pid) goto out_unlock; /* Let the pid remember us for quick removal */ ei->pid = pid; task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); out: return inode; out_unlock: iput(inode); return NULL; } /* * Generating an inode and adding it into @pid->inodes, so that task will * invalidate inode's dentry before being released. * * This helper is used for creating dir-type entries under '/proc' and * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' * can be released by invalidating '/proc/<tgid>' dentry. * In theory, dentries under '/proc/<tgid>/task' can also be released by * invalidating '/proc/<tgid>' dentry, we reserve it to handle single * thread exiting situation: Any one of threads should invalidate its * '/proc/<tgid>/task/<pid>' dentry before released. */ static struct inode *proc_pid_make_base_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode *inode; struct proc_inode *ei; struct pid *pid; inode = proc_pid_make_inode(sb, task, mode); if (!inode) return NULL; /* Let proc_flush_pid find this directory inode */ ei = PROC_I(inode); pid = ei->pid; spin_lock(&pid->lock); hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); spin_unlock(&pid->lock); return inode; } int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; generic_fillattr(&init_user_ns, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, * it only makes getattr() consistent with readdir(). */ return -ENOENT; } task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); } rcu_read_unlock(); return 0; } /* dentry stuff */ /* * Set <pid>/... inode ownership (can change due to setuid(), etc.) */ void pid_update_inode(struct task_struct *task, struct inode *inode) { task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); } /* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * */ static int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; if (flags & LOOKUP_RCU) return -ECHILD; inode = d_inode(dentry); task = get_proc_task(inode); if (task) { pid_update_inode(task, inode); put_task_struct(task); return 1; } return 0; } static inline bool proc_inode_is_dead(struct inode *inode) { return !proc_pid(inode)->tasks[PIDTYPE_PID].first; } int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, }; /* Lookups */ /* * Fill a directory entry. * * If possible create the dcache entry and derive our inode number and * file type from dcache entry. * * Since all of the proc inode numbers are dynamically generated, the inode * numbers do not exist until the inode is cache. This means creating * the dcache entry in readdir is necessary to keep the inode numbers * reported by readdir in sync with the inode numbers reported * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; unsigned type = DT_UNKNOWN; ino_t ino = 1; child = d_hash_and_lookup(dir, &qname); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { struct dentry *res; res = instantiate(child, task, ptr); d_lookup_done(child); if (unlikely(res)) { dput(child); child = res; if (IS_ERR(child)) goto end_instantiate; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); end_instantiate: return dir_emit(ctx, name, len, ino, type); } /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. */ static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { const char *str = dentry->d_name.name; unsigned long long sval, eval; unsigned int len; if (str[0] == '0' && str[1] != '-') return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (sval != (unsigned long)sval) return -EINVAL; str += len; if (*str != '-') return -EINVAL; str++; if (str[0] == '0' && str[1]) return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (eval != (unsigned long)eval) return -EINVAL; str += len; if (*str != '\0') return -EINVAL; *start = sval; *end = eval; return 0; } static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; struct mm_struct *mm = NULL; struct task_struct *task; struct inode *inode; int status = 0; if (flags & LOOKUP_RCU) return -ECHILD; inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { status = mmap_read_lock_killable(mm); if (!status) { exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); mmap_read_unlock(mm); } } mmput(mm); if (exact_vma_exists) { task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); status = 1; } out: put_task_struct(task); out_notask: return status; } static const struct dentry_operations tid_map_files_dentry_operations = { .d_revalidate = map_files_d_revalidate, .d_delete = pid_delete_dentry, }; static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; task = get_proc_task(d_inode(dentry)); if (!task) goto out; mm = get_task_mm(task); put_task_struct(task); if (!mm) goto out; rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); if (rc) goto out_mmput; rc = mmap_read_lock_killable(mm); if (rc) goto out_mmput; rc = -ENOENT; vma = find_exact_vma(m